LuoYuanke updated this revision to Diff 311375. LuoYuanke added a comment. a. Change tilezero back to be chained.
1. It avoids a copy if several tilezero results are the same. Copying is expensive in AMX. 2. It keeps the original order of the AMX intrinsics. b. Refactor the __tilezero interface. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D92837/new/ https://reviews.llvm.org/D92837 Files: clang/include/clang/Basic/BuiltinsX86_64.def clang/lib/Headers/amxintrin.h clang/test/CodeGen/X86/amx_api.c llvm/include/llvm/IR/IntrinsicsX86.td llvm/lib/Target/X86/X86ExpandPseudo.cpp llvm/lib/Target/X86/X86ISelDAGToDAG.cpp llvm/lib/Target/X86/X86InstrAMX.td llvm/lib/Target/X86/X86PreTileConfig.cpp llvm/lib/Target/X86/X86RegisterInfo.cpp llvm/lib/Target/X86/X86RegisterInfo.td llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -verify-machineinstrs | FileCheck %s + +define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { +; CHECK-LABEL: test_amx: +; CHECK: # %bb.0: +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, %ax +; CHECK-NEXT: tilezero %tmm0 +; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm1 +; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm2 +; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 +; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx) +; CHECK-NEXT: tilerelease +; CHECK-NEXT: retq + %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) + %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) + %b = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) + %d = call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d) + + ret void +} + +declare x86_amx @llvm.x86.tilezero.internal(i16, i16) +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) Index: llvm/lib/Target/X86/X86RegisterInfo.td =================================================================== --- 
llvm/lib/Target/X86/X86RegisterInfo.td +++ llvm/lib/Target/X86/X86RegisterInfo.td @@ -639,7 +639,7 @@ let CopyCost = -1 in // Don't allow copying of tile registers def TILE : RegisterClass<"X86", [x86amx], 8192, (sequence "TMM%u", 0, 7)> {let Size = 8192;} -def TILECFG : RegisterClass<"X86", [v512i1], 512, (add TMMCFG)> { +def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> { let CopyCost = -1; // Don't allow copying of tile config registers. let isAllocatable = 1; let Size = 512; Index: llvm/lib/Target/X86/X86RegisterInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86RegisterInfo.cpp +++ llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -873,6 +873,7 @@ // We only collect the tile shape that is defined. case X86::PTILELOADDV: case X86::PTDPBSSDV: + case X86::PTILEZEROV: MachineOperand &MO1 = MI->getOperand(1); MachineOperand &MO2 = MI->getOperand(2); ShapeT Shape(&MO1, &MO2, MRI); Index: llvm/lib/Target/X86/X86PreTileConfig.cpp =================================================================== --- llvm/lib/Target/X86/X86PreTileConfig.cpp +++ llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -132,6 +132,7 @@ llvm_unreachable("Unexpected machine instruction on tile"); case X86::PTILELOADDV: case X86::PTDPBSSDV: + case X86::PTILEZEROV: MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1)); MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2)); ShapeT Shape(&MO1, &MO2, MRI); @@ -230,6 +231,7 @@ case X86::PTILELOADDV: case X86::PTILESTOREDV: case X86::PTDPBSSDV: + case X86::PTILEZEROV: unsigned NumOperands = MI.getNumOperands(); MI.RemoveOperand(NumOperands - 1); MI.addOperand(MF, MachineOperand::CreateReg(CFG, false)); Index: llvm/lib/Target/X86/X86InstrAMX.td =================================================================== --- llvm/lib/Target/X86/X86InstrAMX.td +++ llvm/lib/Target/X86/X86InstrAMX.td @@ -62,6 +62,9 @@ def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, GR16:$src2, 
opaquemem:$src3, TILE:$src4, TILECFG:$cfg), []>; + def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, + GR16:$src2, + TILECFG:$cfg), []>; let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4601,6 +4601,7 @@ case Intrinsic::x86_tdpbssd_internal: { if (!Subtarget->hasAMXTILE()) break; + SDValue Chain = Node->getOperand(0); unsigned Opc = X86::PTDPBSSDV; SDValue CFG = CurDAG->getRegister(0, MVT::v512i1); SDValue Ops[] = {Node->getOperand(2), @@ -4609,7 +4610,20 @@ Node->getOperand(5), Node->getOperand(6), Node->getOperand(7), - CFG}; + CFG, + Chain}; + MachineSDNode *CNode = + CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); + ReplaceNode(Node, CNode); + return; + } + case Intrinsic::x86_tilezero_internal: { + if (!Subtarget->hasAMXTILE()) + break; + unsigned Opc = X86::PTILEZEROV; + SDValue Chain = Node->getOperand(0); + SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); + SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain}; MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); ReplaceNode(Node, CNode); Index: llvm/lib/Target/X86/X86ExpandPseudo.cpp =================================================================== --- llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -494,6 +494,12 @@ MI.setDesc(TII->get(X86::TILESTORED)); return true; } + case X86::PTILEZEROV: { + for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg + MI.RemoveOperand(i); + MI.setDesc(TII->get(X86::TILEZERO)); + return true; + } } llvm_unreachable("Previous switch has a fallthrough?"); } Index: llvm/include/llvm/IR/IntrinsicsX86.td =================================================================== --- 
llvm/include/llvm/IR/IntrinsicsX86.td +++ llvm/include/llvm/IR/IntrinsicsX86.td @@ -5058,6 +5058,10 @@ GCCBuiltin<"__builtin_ia32_tilestored64_internal">, Intrinsic<[], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty, llvm_x86amx_ty], []>; + def int_x86_tilezero_internal : + GCCBuiltin<"__builtin_ia32_tilezero_internal">, + Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty], + []>; } //===----------------------------------------------------------------------===// Index: clang/test/CodeGen/X86/amx_api.c =================================================================== --- clang/test/CodeGen/X86/amx_api.c +++ clang/test/CodeGen/X86/amx_api.c @@ -52,3 +52,10 @@ //CHECK-NEXT: call void @llvm.x86.tilestored64.internal __tile_stored(buf, STRIDE, c); } + +void test_tile_zero(__tile1024i c) { + //CHECK-LABEL: @test_tile_zero + //CHECK: call x86_amx @llvm.x86.tilezero.internal + //CHECK-NEXT: bitcast x86_amx {{%.*}} to <256 x i32> + __tile_zero(&c); +} Index: clang/lib/Headers/amxintrin.h =================================================================== --- clang/lib/Headers/amxintrin.h +++ clang/lib/Headers/amxintrin.h @@ -251,7 +251,7 @@ _tile1024i tile; } __tile1024i; -__DEFAULT_FN_ATTRS_INT8 +__DEFAULT_FN_ATTRS_TILE static void __tile_loadd(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) { dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); @@ -264,10 +264,15 @@ src1.tile, src2.tile); } -__DEFAULT_FN_ATTRS_INT8 +__DEFAULT_FN_ATTRS_TILE static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) { _tile_stored_internal(src.row, src.col, base, stride, src.tile); } +__DEFAULT_FN_ATTRS_TILE +static void __tile_zero(__tile1024i *dst) { + dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col); +} + #endif /* __x86_64__ */ #endif /* __AMXINTRIN_H */ Index: clang/include/clang/Basic/BuiltinsX86_64.def =================================================================== --- 
clang/include/clang/Basic/BuiltinsX86_64.def +++ clang/include/clang/Basic/BuiltinsX86_64.def @@ -104,6 +104,7 @@ TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tilezero_internal, "V256iUsUs", "n", "amx-tile") // AMX TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits