LuoYuanke updated this revision to Diff 311375.
LuoYuanke added a comment.

a. Change tilezero back to be chained.

1. It avoid copy if several tilezero are same. Copy is expensive in AMX.
2. To keep the original order of amx intrinsics.

b. Refactor __tilezero interface.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D92837/new/

https://reviews.llvm.org/D92837

Files:
  clang/include/clang/Basic/BuiltinsX86_64.def
  clang/lib/Headers/amxintrin.h
  clang/test/CodeGen/X86/amx_api.c
  llvm/include/llvm/IR/IntrinsicsX86.td
  llvm/lib/Target/X86/X86ExpandPseudo.cpp
  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
  llvm/lib/Target/X86/X86InstrAMX.td
  llvm/lib/Target/X86/X86PreTileConfig.cpp
  llvm/lib/Target/X86/X86RegisterInfo.cpp
  llvm/lib/Target/X86/X86RegisterInfo.td
  llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll

Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -verify-machineinstrs | FileCheck %s
+
+define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
+; CHECK-LABEL: test_amx:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, %ax
+; CHECK-NEXT:    tilezero %tmm0
+; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm1
+; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm2
+; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:    tilestored %tmm0, (%rdi,%rdx)
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    retq
+  %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
+  %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
+  %b = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
+  %d = call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
+  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d)
+
+  ret void
+}
+
+declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
+declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
Index: llvm/lib/Target/X86/X86RegisterInfo.td
===================================================================
--- llvm/lib/Target/X86/X86RegisterInfo.td
+++ llvm/lib/Target/X86/X86RegisterInfo.td
@@ -639,7 +639,7 @@
 let CopyCost = -1 in // Don't allow copying of tile registers
 def TILE : RegisterClass<"X86", [x86amx], 8192,
                          (sequence "TMM%u", 0, 7)> {let Size = 8192;}
-def TILECFG : RegisterClass<"X86", [v512i1], 512, (add TMMCFG)> {
+def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> {
   let CopyCost = -1;  // Don't allow copying of tile config registers.
   let isAllocatable = 1;
   let Size = 512;
Index: llvm/lib/Target/X86/X86RegisterInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -873,6 +873,7 @@
   // We only collect the tile shape that is defined.
   case X86::PTILELOADDV:
   case X86::PTDPBSSDV:
+  case X86::PTILEZEROV:
     MachineOperand &MO1 = MI->getOperand(1);
     MachineOperand &MO2 = MI->getOperand(2);
     ShapeT Shape(&MO1, &MO2, MRI);
Index: llvm/lib/Target/X86/X86PreTileConfig.cpp
===================================================================
--- llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -132,6 +132,7 @@
     llvm_unreachable("Unexpected machine instruction on tile");
   case X86::PTILELOADDV:
   case X86::PTDPBSSDV:
+  case X86::PTILEZEROV:
     MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
     MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
     ShapeT Shape(&MO1, &MO2, MRI);
@@ -230,6 +231,7 @@
       case X86::PTILELOADDV:
       case X86::PTILESTOREDV:
       case X86::PTDPBSSDV:
+      case X86::PTILEZEROV:
         unsigned NumOperands = MI.getNumOperands();
         MI.RemoveOperand(NumOperands - 1);
         MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
Index: llvm/lib/Target/X86/X86InstrAMX.td
===================================================================
--- llvm/lib/Target/X86/X86InstrAMX.td
+++ llvm/lib/Target/X86/X86InstrAMX.td
@@ -62,6 +62,9 @@
     def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
                                             GR16:$src2, opaquemem:$src3,
                                             TILE:$src4, TILECFG:$cfg), []>;
+    def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+                                                     GR16:$src2,
+                                                     TILECFG:$cfg), []>;
 
     let usesCustomInserter = 1 in {
       // Pseudo instructions, using immediates instead of tile registers.
Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4601,6 +4601,7 @@
     case Intrinsic::x86_tdpbssd_internal: {
       if (!Subtarget->hasAMXTILE())
         break;
+      SDValue Chain = Node->getOperand(0);
       unsigned Opc = X86::PTDPBSSDV;
       SDValue CFG = CurDAG->getRegister(0, MVT::v512i1);
       SDValue Ops[] = {Node->getOperand(2),
@@ -4609,7 +4610,20 @@
                        Node->getOperand(5),
                        Node->getOperand(6),
                        Node->getOperand(7),
-                       CFG};
+                       CFG,
+                       Chain};
+      MachineSDNode *CNode =
+          CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
+      ReplaceNode(Node, CNode);
+      return;
+    }
+    case Intrinsic::x86_tilezero_internal: {
+      if (!Subtarget->hasAMXTILE())
+        break;
+      unsigned Opc = X86::PTILEZEROV;
+      SDValue Chain = Node->getOperand(0);
+      SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+      SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain};
       MachineSDNode *CNode =
           CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
       ReplaceNode(Node, CNode);
Index: llvm/lib/Target/X86/X86ExpandPseudo.cpp
===================================================================
--- llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -494,6 +494,12 @@
     MI.setDesc(TII->get(X86::TILESTORED));
     return true;
   }
+  case X86::PTILEZEROV: {
+    for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg
+      MI.RemoveOperand(i);
+    MI.setDesc(TII->get(X86::TILEZERO));
+    return true;
+  }
   }
   llvm_unreachable("Previous switch has a fallthrough?");
 }
Index: llvm/include/llvm/IR/IntrinsicsX86.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsX86.td
+++ llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5058,6 +5058,10 @@
               GCCBuiltin<"__builtin_ia32_tilestored64_internal">,
               Intrinsic<[], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty,
                              llvm_i64_ty, llvm_x86amx_ty], []>;
+  def int_x86_tilezero_internal :
+              GCCBuiltin<"__builtin_ia32_tilezero_internal">,
+              Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty],
+                        []>;
 }
 
 //===----------------------------------------------------------------------===//
Index: clang/test/CodeGen/X86/amx_api.c
===================================================================
--- clang/test/CodeGen/X86/amx_api.c
+++ clang/test/CodeGen/X86/amx_api.c
@@ -52,3 +52,10 @@
   //CHECK-NEXT: call void @llvm.x86.tilestored64.internal
   __tile_stored(buf, STRIDE, c);
 }
+
+void test_tile_zero(__tile1024i c) {
+  //CHECK-LABEL: @test_tile_zero
+  //CHECK: call x86_amx @llvm.x86.tilezero.internal
+  //CHECK-NEXT bitcast x86_amx {{%.*}} to <256 x i32>
+  __tile_zero(&c);
+}
Index: clang/lib/Headers/amxintrin.h
===================================================================
--- clang/lib/Headers/amxintrin.h
+++ clang/lib/Headers/amxintrin.h
@@ -251,7 +251,7 @@
   _tile1024i tile;
 } __tile1024i;
 
-__DEFAULT_FN_ATTRS_INT8
+__DEFAULT_FN_ATTRS_TILE
 static void __tile_loadd(__tile1024i *dst, const void *base,
                          __SIZE_TYPE__ stride) {
   dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
@@ -264,10 +264,15 @@
                                     src1.tile, src2.tile);
 }
 
-__DEFAULT_FN_ATTRS_INT8
+__DEFAULT_FN_ATTRS_TILE
 static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
   _tile_stored_internal(src.row, src.col, base, stride, src.tile);
 }
 
+__DEFAULT_FN_ATTRS_TILE
+static void __tile_zero(__tile1024i *dst) {
+  dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
+}
+
 #endif /* __x86_64__ */
 #endif /* __AMXINTRIN_H */
Index: clang/include/clang/Basic/BuiltinsX86_64.def
===================================================================
--- clang/include/clang/Basic/BuiltinsX86_64.def
+++ clang/include/clang/Basic/BuiltinsX86_64.def
@@ -104,6 +104,7 @@
 TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tilezero_internal, "V256iUsUs", "n", "amx-tile")
 // AMX
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to