================
@@ -2378,6 +2456,221 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
   return Changed;
 }
 
+bool SIGfx6CacheControl ::handleNonAtomicForPreciseMemory(
+    MachineBasicBlock::iterator &MI) {
+  assert(MI->mayLoadOrStore());
+
+  MachineInstr &Inst = *MI;
+  AMDGPU::Waitcnt Wait;
+
+  if (TII->isSMRD(Inst)) { // scalar
+    if (Inst.mayStore())
+      return false;
+    Wait.DsCnt = 0; // LgkmCnt
+  } else { // vector
+    if (Inst.mayLoad()) { // vector load
+      if (TII->isVMEM(Inst)) { // VMEM load
+        Wait.LoadCnt = 0; // VmCnt
+      } else if (TII->isFLAT(Inst)) { // Flat load
+        Wait.LoadCnt = 0; // VmCnt
+        Wait.DsCnt = 0; // LgkmCnt
+      } else { // LDS load
+        Wait.DsCnt = 0; // LgkmCnt
+      }
+    } else { // vector store
+      if (TII->isVMEM(Inst)) { // VMEM store
+        Wait.LoadCnt = 0; // VmCnt
+      } else if (TII->isFLAT(Inst)) { // Flat store
+        Wait.LoadCnt = 0; // VmCnt
+        Wait.DsCnt = 0; // LgkmCnt
+      } else {
+        Wait.DsCnt = 0; // LDS store; LgkmCnt
+      }
+    }
+  }
+
+  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+  MachineBasicBlock &MBB = *MI->getParent();
+  BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+  --MI;
+  return true;
+}
+
+bool SIGfx6CacheControl ::handleAtomicForPreciseMemory(
+    MachineBasicBlock::iterator &MI, bool ret) {
+  assert(MI->mayLoadOrStore());
+
+  AMDGPU::Waitcnt Wait;
+
+  Wait.LoadCnt = 0; // VmCnt
+  Wait.DsCnt = 0; // LgkmCnt
+
+  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+  MachineBasicBlock &MBB = *MI->getParent();
+  BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+  --MI;
+  return true;
+}
+
+bool SIGfx10CacheControl ::handleNonAtomicForPreciseMemory(
+    MachineBasicBlock::iterator &MI) {
+  assert(MI->mayLoadOrStore());
+
+  MachineInstr &Inst = *MI;
+  AMDGPU::Waitcnt Wait;
+
+  bool BuildWaitCnt = true;
+  bool BuildVsCnt = false;
+
+  if (TII->isSMRD(Inst)) { // scalar
+    if (Inst.mayStore())
+      return false;
+    Wait.DsCnt = 0; // LgkmCnt
+  } else { // vector
+    if (Inst.mayLoad()) { // vector load
+      if (TII->isVMEM(Inst)) { // VMEM load
+        Wait.LoadCnt = 0; // VmCnt
+      } else if (TII->isFLAT(Inst)) { // Flat load
+        Wait.LoadCnt = 0; // VmCnt
+        Wait.DsCnt = 0; // LgkmCnt
+      } else { // LDS load
+        Wait.DsCnt = 0; // LgkmCnt
+      }
+    }
+
+    // For some vector instructions, mayLoad() and mayStore() can be both true.
+    if (Inst.mayStore()) { // vector store; an instruction can be both
+                           // load/store
+      if (TII->isVMEM(Inst)) { // VMEM store
+        if (!Inst.mayLoad())
+          BuildWaitCnt = false;
+        BuildVsCnt = true;
+      } else if (TII->isFLAT(Inst)) { // Flat store
+        Wait.DsCnt = 0; // LgkmCnt
+        BuildVsCnt = true;
+      } else {
+        Wait.DsCnt = 0; // LDS store; LgkmCnt
+      }
+    }
+  }
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  if (BuildWaitCnt) {
+    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+    BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+    --MI;
+  }
+
+  if (BuildVsCnt) {
+    BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+        .addImm(0);
+    --MI;
+  }
+  return true;
+}
+
+bool SIGfx10CacheControl ::handleAtomicForPreciseMemory(
+    MachineBasicBlock::iterator &MI, bool ret) {
+  assert(MI->mayLoadOrStore());
+
+  AMDGPU::Waitcnt Wait;
+
+  Wait.DsCnt = 0; // LgkmCnt
+  if (ret)
+    Wait.LoadCnt = 0; // VmCnt
+
+  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+  MachineBasicBlock &MBB = *MI->getParent();
+  BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+  --MI;
+  if (!ret) {
+    BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+        .addImm(0);
+    --MI;
+  }
+  return true;
+}
+
+bool SIGfx12CacheControl ::handleNonAtomicForPreciseMemory(
+    MachineBasicBlock::iterator &MI) {
+  assert(MI->mayLoadOrStore());
+
+  MachineInstr &Inst = *MI;
+  unsigned WaitType = 0;
+  // For some vector instructions, mayLoad() and mayStore() can be both true.
+  bool LoadAndStore = false;
+
+  if (TII->isSMRD(Inst)) { // scalar
+    if (Inst.mayStore())
+      return false;
+
+    WaitType = AMDGPU::S_WAIT_KMCNT;
+  } else { // vector
+    if (Inst.mayLoad() && Inst.mayStore()) {
+      WaitType = AMDGPU::S_WAIT_LOADCNT;
+      LoadAndStore = true;
+    } else if (Inst.mayLoad()) { // vector load
+      if (TII->isVMEM(Inst)) { // VMEM load
+        WaitType = AMDGPU::S_WAIT_LOADCNT;
+      } else if (TII->isFLAT(Inst)) { // Flat load
+        WaitType = AMDGPU::S_WAIT_LOADCNT_DSCNT;
+      } else { // LDS load
+        WaitType = AMDGPU::S_WAIT_DSCNT;
+      }
+    } else { // vector store
+      if (TII->isVMEM(Inst)) { // VMEM store
+        WaitType = AMDGPU::S_WAIT_STORECNT;
+      } else if (TII->isFLAT(Inst)) { // Flat store
+        WaitType = AMDGPU::S_WAIT_STORECNT_DSCNT;
+      } else {
+        WaitType = AMDGPU::S_WAIT_DSCNT;
+      }
+    }
+  }
+
+  assert(WaitType != 0);
+
+  MachineBasicBlock &MBB = *MI->getParent();
+
+  unsigned Enc = 0;
+  if (WaitType == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
+    AMDGPU::Waitcnt Wait;
+    Wait.DsCnt = 0;
+    Wait.LoadCnt = 0;
+    Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
+  } else if (WaitType == AMDGPU::S_WAIT_STORECNT_DSCNT) {
+    AMDGPU::Waitcnt Wait;
+    Wait.DsCnt = 0;
+    Wait.StoreCnt = 0;
+    Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
+  }
+
+  BuildMI(MBB, ++MI, DebugLoc(), TII->get(WaitType)).addImm(Enc);
+  --MI;
+  if (LoadAndStore) {
+    BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAIT_STORECNT))
+        .addImm(Enc);
+    --MI;
+  }
+  return true;
+}
+
+bool SIGfx12CacheControl ::handleAtomicForPreciseMemory(
+    MachineBasicBlock::iterator &MI, bool ret) {
+  assert(MI->mayLoadOrStore());
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  if (ret) {
+    BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT)).addImm(0);
+  } else {
+    BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAIT_STORECNT)).addImm(0);
+  }
----------------
Pierre-vh wrote:
can drop `{}`

https://github.com/llvm/llvm-project/pull/79236
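For illustration, applying that suggestion to the quoted GFX12 handleAtomicForPreciseMemory would look roughly like the sketch below. Only the quoted if/else is shown, with the single-statement braces dropped as the LLVM coding standard prefers; the rest of the function is assumed unchanged from the hunk above.

  MachineBasicBlock &MBB = *MI->getParent();
  // Single-statement bodies: braces omitted, per the review suggestion.
  if (ret)
    BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT)).addImm(0);
  else
    BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAIT_STORECNT)).addImm(0);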