Author: Liu, Chen3
Date: 2021-02-23T17:08:05+08:00
New Revision: f8b9035aae4450b4c3c6c9e2a071ac06d520413a

URL: https://github.com/llvm/llvm-project/commit/f8b9035aae4450b4c3c6c9e2a071ac06d520413a
DIFF: https://github.com/llvm/llvm-project/commit/f8b9035aae4450b4c3c6c9e2a071ac06d520413a.diff

LOG: [X86] Support amx-int8 intrinsic.

Adding support for intrinsics of TDPBSUD/TDPBUSD/TDPBUUD.

Differential Revision: https://reviews.llvm.org/D97259

Added:

Modified:
    clang/include/clang/Basic/BuiltinsX86_64.def
    clang/lib/Headers/amxintrin.h
    clang/test/CodeGen/X86/amx_api.c
    llvm/include/llvm/IR/IntrinsicsX86.td
    llvm/lib/Target/X86/X86ExpandPseudo.cpp
    llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
    llvm/lib/Target/X86/X86InstrAMX.td
    llvm/lib/Target/X86/X86LowerAMXType.cpp
    llvm/lib/Target/X86/X86PreTileConfig.cpp
    llvm/lib/Target/X86/X86RegisterInfo.cpp
    llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll

Removed:

################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 974ba35b3233..aed46b352342 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -103,6 +103,9 @@ TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr")
 // AMX internal builtin
 TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbuud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tilezero_internal, "V256iUsUs", "n", "amx-tile")
 // AMX
diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
index 823c7ca1f076..31a2b64b9ff2 100644
--- 
a/clang/lib/Headers/amxintrin.h +++ b/clang/lib/Headers/amxintrin.h @@ -238,6 +238,24 @@ _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k, return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2); } +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2); +} + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2); +} + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2); +} + static __inline__ void __DEFAULT_FN_ATTRS_INT8 _tile_stored_internal(unsigned short m, unsigned short n, void *base, __SIZE_TYPE__ stride, _tile1024i tile) { @@ -264,6 +282,27 @@ static void __tile_dpbssd(__tile1024i *dst, __tile1024i src1, src1.tile, src2.tile); } +__DEFAULT_FN_ATTRS_INT8 +static void __tile_dpbsud(__tile1024i *dst, __tile1024i src1, + __tile1024i src2) { + dst->tile = _tile_dpbsud_internal(src1.row, src2.col, src1.col, dst->tile, + src1.tile, src2.tile); +} + +__DEFAULT_FN_ATTRS_INT8 +static void __tile_dpbusd(__tile1024i *dst, __tile1024i src1, + __tile1024i src2) { + dst->tile = _tile_dpbusd_internal(src1.row, src2.col, src1.col, dst->tile, + src1.tile, src2.tile); +} + +__DEFAULT_FN_ATTRS_INT8 +static void __tile_dpbuud(__tile1024i *dst, __tile1024i src1, + __tile1024i src2) { + dst->tile = _tile_dpbuud_internal(src1.row, src2.col, src1.col, dst->tile, + src1.tile, src2.tile); +} + __DEFAULT_FN_ATTRS_TILE static void __tile_stored(void *base, __SIZE_TYPE__ 
stride, __tile1024i src) { _tile_stored_internal(src.row, src.col, base, stride, src.tile); diff --git a/clang/test/CodeGen/X86/amx_api.c b/clang/test/CodeGen/X86/amx_api.c index 7d3aa385f6a2..7120de4c9e88 100644 --- a/clang/test/CodeGen/X86/amx_api.c +++ b/clang/test/CodeGen/X86/amx_api.c @@ -46,6 +46,27 @@ void test_tile_dpbssd(__tile1024i a, __tile1024i b, __tile1024i c) { __tile_dpbssd(&c, a, b); } +void test_tile_dpbsud(__tile1024i a, __tile1024i b, __tile1024i c) { + //CHECK-LABEL: @test_tile_dpbsud + //CHECK: call x86_amx @llvm.x86.tdpbsud.internal + //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32> + __tile_dpbsud(&c, a, b); +} + +void test_tile_dpbusd(__tile1024i a, __tile1024i b, __tile1024i c) { + //CHECK-LABEL: @test_tile_dpbusd + //CHECK: call x86_amx @llvm.x86.tdpbusd.internal + //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32> + __tile_dpbusd(&c, a, b); +} + +void test_tile_dpbuud(__tile1024i a, __tile1024i b, __tile1024i c) { + //CHECK-LABEL: @test_tile_dpbuud + //CHECK: call x86_amx @llvm.x86.tdpbuud.internal + //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32> + __tile_dpbuud(&c, a, b); +} + void test_tile_stored(__tile1024i c) { //CHECK-LABEL: @test_tile_stored //CHECK: {{%.*}} = bitcast <256 x i32> {{%.*}} to x86_amx diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index bba12139976e..2c1202cc2a05 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5053,6 +5053,24 @@ let TargetPrefix = "x86" in { [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_x86amx_ty, llvm_x86amx_ty], []>; + def int_x86_tdpbsud_internal : + GCCBuiltin<"__builtin_ia32_tdpbsud_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, + llvm_x86amx_ty, llvm_x86amx_ty, + llvm_x86amx_ty], []>; + def int_x86_tdpbusd_internal : + GCCBuiltin<"__builtin_ia32_tdpbusd_internal">, + Intrinsic<[llvm_x86amx_ty], + 
[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, + llvm_x86amx_ty, llvm_x86amx_ty, + llvm_x86amx_ty], []>; + def int_x86_tdpbuud_internal : + GCCBuiltin<"__builtin_ia32_tdpbuud_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, + llvm_x86amx_ty, llvm_x86amx_ty, + llvm_x86amx_ty], []>; def int_x86_tilestored64_internal : GCCBuiltin<"__builtin_ia32_tilestored64_internal">, Intrinsic<[], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 395f437bb648..fc4e9eb4a4bb 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -467,11 +467,22 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MI.setDesc(TII->get(X86::TILELOADD)); return true; } - case X86::PTDPBSSDV: { + case X86::PTDPBSSDV: + case X86::PTDPBSUDV: + case X86::PTDPBUSDV: + case X86::PTDPBUUDV: { MI.untieRegOperand(4); for (unsigned i = 3; i > 0; --i) MI.RemoveOperand(i); - MI.setDesc(TII->get(X86::TDPBSSD)); + unsigned Opc; + switch (Opcode) { + case X86::PTDPBSSDV: Opc = X86::TDPBSSD; break; + case X86::PTDPBSUDV: Opc = X86::TDPBSUD; break; + case X86::PTDPBUSDV: Opc = X86::TDPBUSD; break; + case X86::PTDPBUUDV: Opc = X86::TDPBUUD; break; + default: llvm_unreachable("Impossible Opcode!"); + } + MI.setDesc(TII->get(Opc)); MI.tieOperands(0, 1); return true; } diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 5c473408c9ec..bebd430af6a7 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4621,11 +4621,22 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CNode); return; } - case Intrinsic::x86_tdpbssd_internal: { + + case Intrinsic::x86_tdpbssd_internal: + case Intrinsic::x86_tdpbsud_internal: + case Intrinsic::x86_tdpbusd_internal: + case Intrinsic::x86_tdpbuud_internal: { if (!Subtarget->hasAMXTILE()) break; SDValue Chain = 
Node->getOperand(0); - unsigned Opc = X86::PTDPBSSDV; + unsigned Opc; + switch (IntNo) { + case Intrinsic::x86_tdpbssd_internal: Opc = X86::PTDPBSSDV; break; + case Intrinsic::x86_tdpbsud_internal: Opc = X86::PTDPBSUDV; break; + case Intrinsic::x86_tdpbusd_internal: Opc = X86::PTDPBUSDV; break; + case Intrinsic::x86_tdpbuud_internal: Opc = X86::PTDPBUUDV; break; + default: llvm_unreachable("Impossible intrinsic"); + } SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Node->getOperand(4), diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 209ebd4b3de3..b93aab30161d 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -92,10 +92,20 @@ let Predicates = [HasAMXINT8, In64BitMode] in { } // Pseduo instruction for RA. - let Constraints = "$src4 = $dst" in - def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), []>; + let Constraints = "$src4 = $dst" in { + def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, + GR16:$src2, GR16:$src3, TILE:$src4, + TILE:$src5, TILE:$src6), []>; + def PTDPBSUDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, + GR16:$src2, GR16:$src3, TILE:$src4, + TILE:$src5, TILE:$src6), []>; + def PTDPBUSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, + GR16:$src2, GR16:$src3, TILE:$src4, + TILE:$src5, TILE:$src6), []>; + def PTDPBUUDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, + GR16:$src2, GR16:$src3, TILE:$src4, + TILE:$src5, TILE:$src6), []>; + } let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. 
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 85166decd8cd..3fdcf1607d22 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -67,7 +67,10 @@ static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) { } // a * b + c // The shape depends on which operand. - case Intrinsic::x86_tdpbssd_internal: { + case Intrinsic::x86_tdpbssd_internal: + case Intrinsic::x86_tdpbsud_internal: + case Intrinsic::x86_tdpbusd_internal: + case Intrinsic::x86_tdpbuud_internal: { switch (OpNo) { case 3: Row = II->getArgOperand(0); diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index 432e1fe2b694..90b421b44d7a 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -155,6 +155,9 @@ static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) { llvm_unreachable("Unexpected machine instruction on tile"); case X86::PTILELOADDV: case X86::PTDPBSSDV: + case X86::PTDPBSUDV: + case X86::PTDPBUSDV: + case X86::PTDPBUUDV: case X86::PTILEZEROV: MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1)); MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2)); @@ -249,6 +252,9 @@ static bool isAMXInstruction(MachineBasicBlock::iterator MII) { case X86::PTILELOADDV: case X86::PTILESTOREDV: case X86::PTDPBSSDV: + case X86::PTDPBSUDV: + case X86::PTDPBUSDV: + case X86::PTDPBUUDV: case X86::PTILEZEROV: return true; } diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 261293c785a1..00bb73fa2d9a 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -884,6 +884,9 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, // We only collect the tile shape that is defined. 
case X86::PTILELOADDV: case X86::PTDPBSSDV: + case X86::PTDPBSUDV: + case X86::PTDPBUSDV: + case X86::PTDPBUUDV: case X86::PTILEZEROV: MachineOperand &MO1 = MI->getOperand(1); MachineOperand &MO2 = MI->getOperand(2); diff --git a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll index 501bde029dc1..ebb6ee5bc231 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll @@ -19,6 +19,9 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm1 ; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm2 ; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 +; CHECK-NEXT: tdpbsud %tmm2, %tmm1, %tmm0 +; CHECK-NEXT: tdpbusd %tmm2, %tmm1, %tmm0 +; CHECK-NEXT: tdpbuud %tmm2, %tmm1, %tmm0 ; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx) ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper @@ -26,8 +29,11 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) %b = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) - %d = call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d) + %d0 = call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) + %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b) + %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b) + %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d3) ret void } @@ -35,4 +41,7 @@ define void @test_amx(i8* %pointer, i8* 
%base, i64 %stride) {
 declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits