[llvm-branch-commits] [clang] 4909cb1 - [OpenMP][AMDGPU] Use AMDGPU_KERNEL calling convention for entry function

2021-01-05 Thread Pushpinder Singh via llvm-branch-commits

Author: Pushpinder Singh
Date: 2021-01-06T02:03:30-05:00
New Revision: 4909cb1a0fe9f2494ccbadc2856b6ddfc70051b5

URL: https://github.com/llvm/llvm-project/commit/4909cb1a0fe9f2494ccbadc2856b6ddfc70051b5
DIFF: https://github.com/llvm/llvm-project/commit/4909cb1a0fe9f2494ccbadc2856b6ddfc70051b5.diff

LOG: [OpenMP][AMDGPU] Use AMDGPU_KERNEL calling convention for entry function

The AMDGPU backend requires entry functions (kernels) to use the
AMDGPU_KERNEL calling convention for proper linking.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D94060
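
For illustration, the following is a minimal standalone sketch, not the clang
code path itself (the kernel name and the main() driver are hypothetical),
showing what the change amounts to: giving an offload entry function the
AMDGPU_KERNEL calling convention on top of the weak linkage and non-dso_local
setting that the surrounding code already applies.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("offload_module", Ctx);

  // Hypothetical entry-point name; clang generates __omp_offloading_* names.
  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false);
  Function *Kernel = Function::Create(FTy, GlobalValue::WeakAnyLinkage,
                                      "__omp_offloading_example_kernel", &M);
  Kernel->setDSOLocal(false);

  // Without this, the function keeps the default C calling convention and the
  // AMDGPU backend will not treat it as a kernel entry point.
  Kernel->setCallingConv(CallingConv::AMDGPU_KERNEL);

  // Give the function a trivial body so it prints as a definition:
  //   define weak amdgpu_kernel void @__omp_offloading_example_kernel()
  // which is the shape the updated CHECK lines below expect.
  IRBuilder<> Builder(BasicBlock::Create(Ctx, "entry", Kernel));
  Builder.CreateRetVoid();

  M.print(outs(), nullptr);
  return 0;
}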

Added: 


Modified: 
clang/lib/CodeGen/CGOpenMPRuntime.cpp
clang/test/OpenMP/amdgcn_target_codegen.cpp

Removed: 




diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index c15f6350b95e..a3b24039365b 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -6471,6 +6471,8 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
 OutlinedFnID = llvm::ConstantExpr::getBitCast(OutlinedFn, CGM.Int8PtrTy);
 OutlinedFn->setLinkage(llvm::GlobalValue::WeakAnyLinkage);
 OutlinedFn->setDSOLocal(false);
+if (CGM.getTriple().isAMDGCN())
+  OutlinedFn->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
   } else {
 std::string Name = getName({EntryFnName, "region_id"});
 OutlinedFnID = new llvm::GlobalVariable(

diff --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp b/clang/test/OpenMP/amdgcn_target_codegen.cpp
index 416ed06083b0..701211d449ca 100644
--- a/clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -9,7 +9,7 @@
 #define N 1000
 
 int test_amdgcn_target_tid_threads() {
-// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads
+// CHECK-LABEL: define weak amdgpu_kernel void @{{.*}}test_amdgcn_target_tid_threads
 
   int arr[N];
 
@@ -25,7 +25,7 @@ int test_amdgcn_target_tid_threads() {
 }
 
 int test_amdgcn_target_tid_threads_simd() {
-// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads_simd
+// CHECK-LABEL: define weak amdgpu_kernel void @{{.*}}test_amdgcn_target_tid_threads_simd
 
   int arr[N];
 





[llvm-branch-commits] [llvm] e2303a4 - [FastRA] Fix handling of bundled MIs

2020-12-20 Thread Pushpinder Singh via llvm-branch-commits

Author: Pushpinder Singh
Date: 2020-12-21T02:10:55-05:00
New Revision: e2303a448e2fcc1d96d66e9ee9f0cfc009b69a3f

URL: https://github.com/llvm/llvm-project/commit/e2303a448e2fcc1d96d66e9ee9f0cfc009b69a3f
DIFF: https://github.com/llvm/llvm-project/commit/e2303a448e2fcc1d96d66e9ee9f0cfc009b69a3f.diff

LOG: [FastRA] Fix handling of bundled MIs

The fast register allocator skips bundled MIs because its main assignment
loop uses MachineBasicBlock::iterator (a MachineInstrBundleIterator), which
steps over a bundle as a single unit. This was causing SIInsertWaitcnts,
which expects every instruction to have physical registers assigned, to
crash.

This patch applies the register assignments made for the BUNDLE header to
every instruction inside the bundle.

Reviewed By: qcolombet

Differential Revision: https://reviews.llvm.org/D90369
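
For reference, a minimal sketch of the idea (illustrative helper, not the
exact RegAllocFast code): the header's virtual-to-physical assignments are
recorded while the BUNDLE instruction itself is allocated, then replayed onto
the instructions bundled with it, which the bundle-skipping top-level
iterator never visits.

#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/MC/MCRegister.h"
#include <cassert>
#include <iterator>

using namespace llvm;

// Hypothetical helper: copy the physreg chosen for each virtual register of
// the BUNDLE header onto the operands of the instructions inside the bundle.
static void rewriteBundledInstrs(MachineInstr &Header,
                                 const DenseMap<Register, MCPhysReg> &Assignments) {
  assert(Header.getOpcode() == TargetOpcode::BUNDLE && "expected a BUNDLE header");
  MachineBasicBlock::instr_iterator I = std::next(Header.getIterator());
  MachineBasicBlock::instr_iterator E = Header.getParent()->instr_end();
  // instr_iterator (unlike MachineBasicBlock::iterator) visits the MIs inside
  // the bundle; they are chained to the header via isBundledWithPred().
  for (; I != E && I->isBundledWithPred(); ++I) {
    for (MachineOperand &MO : I->operands()) {
      if (!MO.isReg() || !MO.getReg().isVirtual())
        continue;
      auto It = Assignments.find(MO.getReg());
      if (It != Assignments.end())
        MO.setReg(It->second); // same physreg that the header operand received
    }
  }
}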

Added: 
llvm/test/CodeGen/AMDGPU/fast-regalloc-bundles.mir
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll

Modified: 
llvm/lib/CodeGen/RegAllocFast.cpp

Removed: 




diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 09c4674e4be6..d6c5e11fd0c5 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -105,6 +105,9 @@ namespace {
 /// available in a physical register.
 LiveRegMap LiveVirtRegs;
 
+/// Stores assigned virtual registers present in the bundle MI.
+DenseMap<Register, MCPhysReg> BundleVirtRegsMap;
+
 DenseMap> LiveDbgValueMap;
 /// List of DBG_VALUE that we encountered without the vreg being assigned
 /// because they were placed after the last use of the vreg.
@@ -218,6 +221,8 @@ namespace {
 
 void allocateInstruction(MachineInstr &MI);
 void handleDebugValue(MachineInstr &MI);
+void handleBundle(MachineInstr &MI);
+
 bool usePhysReg(MachineInstr &MI, MCPhysReg PhysReg);
 bool definePhysReg(MachineInstr &MI, MCPhysReg PhysReg);
 bool displacePhysReg(MachineInstr &MI, MCPhysReg PhysReg);
@@ -889,6 +894,9 @@ void RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum,
 LRI->LiveOut = false;
 LRI->Reloaded = false;
   }
+  if (MI.getOpcode() == TargetOpcode::BUNDLE) {
+BundleVirtRegsMap[VirtReg] = PhysReg;
+  }
   markRegUsedInInstr(PhysReg);
   setPhysReg(MI, MO, PhysReg);
 }
@@ -934,6 +942,10 @@ void RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum,
   }
 
   LRI->LastUse = &MI;
+
+  if (MI.getOpcode() == TargetOpcode::BUNDLE) {
+BundleVirtRegsMap[VirtReg] = LRI->PhysReg;
+  }
   markRegUsedInInstr(LRI->PhysReg);
   setPhysReg(MI, MO, LRI->PhysReg);
 }
@@ -1064,6 +1076,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
   //   operands and early-clobbers.
 
   UsedInInstr.clear();
+  BundleVirtRegsMap.clear();
 
   // Scan for special cases; Apply pre-assigned register defs to state.
   bool HasPhysRegUse = false;
@@ -1382,6 +1395,30 @@ void RegAllocFast::handleDebugValue(MachineInstr &MI) {
   LiveDbgValueMap[Reg].push_back(&MI);
 }
 
+void RegAllocFast::handleBundle(MachineInstr &MI) {
+  MachineBasicBlock::instr_iterator BundledMI = MI.getIterator();
+  ++BundledMI;
+  while (BundledMI->isBundledWithPred()) {
+for (unsigned I = 0; I < BundledMI->getNumOperands(); ++I) {
+  MachineOperand &MO = BundledMI->getOperand(I);
+  if (!MO.isReg())
+continue;
+
+  Register Reg = MO.getReg();
+  if (!Reg.isVirtual())
+continue;
+
+  DenseMap<Register, MCPhysReg>::iterator DI;
+  DI = BundleVirtRegsMap.find(Reg);
+  assert(DI != BundleVirtRegsMap.end() && "Unassigned virtual register");
+
+  setPhysReg(MI, MO, DI->second);
+}
+
+++BundledMI;
+  }
+}
+
 void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
   this->MBB = &MBB;
   LLVM_DEBUG(dbgs() << "\nAllocating " << MBB);
@@ -1411,6 +1448,12 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
 }
 
 allocateInstruction(MI);
+
+// Once BUNDLE header is assigned registers, same assignments need to be
+// done for bundled MIs.
+if (MI.getOpcode() == TargetOpcode::BUNDLE) {
+  handleBundle(MI);
+}
   }
 
   LLVM_DEBUG(

diff --git a/llvm/test/CodeGen/AMDGPU/fast-regalloc-bundles.mir b/llvm/test/CodeGen/AMDGPU/fast-regalloc-bundles.mir
new file mode 100644
index ..dde48a97f152
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fast-regalloc-bundles.mir
@@ -0,0 +1,26 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass=regallocfast %s -o - | FileCheck -check-prefixes=GCN,XNACK,GCX9 %s
+
+---
+name: fast_regalloc_bundle_handling
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body: |
+  bb.0:
+; GCN-LABEL: name: fast_regalloc_bundle_handling
+; GCN: renamable $vgpr0 = IMPLICIT_DEF
+; GCN: renamable $vgpr1 = IMPLICIT_DEF
+; GCN: renamable $vgpr0 = BUNDL

[llvm-branch-commits] [openmp] afc09c6 - [libomptarget][AMDGPU] Remove MaxParallelLevel

2020-12-02 Thread Pushpinder Singh via llvm-branch-commits

Author: Pushpinder Singh
Date: 2020-12-03T00:27:03-05:00
New Revision: afc09c6fe44ecf99e5946b7fe08013f592504448

URL: https://github.com/llvm/llvm-project/commit/afc09c6fe44ecf99e5946b7fe08013f592504448
DIFF: https://github.com/llvm/llvm-project/commit/afc09c6fe44ecf99e5946b7fe08013f592504448.diff

LOG: [libomptarget][AMDGPU] Remove MaxParallelLevel

Removes MaxParallelLevel references from rtl.cpp and drops
resulting dead code.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D92463

Added: 


Modified: 
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 




diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index dc3a288903f0..477439d19b50 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -183,17 +183,15 @@ struct KernelTy {
   // 1 - Generic mode (with master warp)
   int8_t ExecutionMode;
   int16_t ConstWGSize;
-  int8_t MaxParLevel;
   int32_t device_id;
   void *CallStackAddr;
   const char *Name;
 
-  KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int8_t _MaxParLevel,
-   int32_t _device_id, void *_CallStackAddr, const char *_Name,
+  KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id,
+   void *_CallStackAddr, const char *_Name,
uint32_t _kernarg_segment_size)
   : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize),
-MaxParLevel(_MaxParLevel), device_id(_device_id),
-CallStackAddr(_CallStackAddr), Name(_Name) {
+device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) {
 DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
 
 std::string N(_Name);
@@ -1140,9 +1138,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
 // get flat group size if present, else Default_WG_Size
 int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
 
-// Max parallel level
-int16_t MaxParLevVal = 0;
-
 // get Kernel Descriptor if present.
 // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
 struct KernDescValType {
@@ -1151,7 +1146,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
   uint16_t WG_Size;
   uint8_t Mode;
   uint8_t HostServices;
-  uint8_t MaxParallelLevel;
 };
 struct KernDescValType KernDescVal;
 std::string KernDescNameStr(e->name);
@@ -1183,31 +1177,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
   DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size);
   DP("KernDesc: Mode: %d\n", KernDescVal.Mode);
   DP("KernDesc: HostServices: %x\n", KernDescVal.HostServices);
-  DP("KernDesc: MaxParallelLevel: %x\n", KernDescVal.MaxParallelLevel);
-
-  // gather location of callStack and size of struct
-  MaxParLevVal = KernDescVal.MaxParallelLevel;
-  if (MaxParLevVal > 0) {
-uint32_t varsize;
-const char *CsNam = "omptarget_nest_par_call_stack";
-err = atmi_interop_hsa_get_symbol_info(place, CsNam, &CallStackAddr,
-   &varsize);
-if (err != ATMI_STATUS_SUCCESS) {
-  fprintf(stderr, "Addr of %s failed\n", CsNam);
-  return NULL;
-}
-void *StructSizePtr;
-const char *SsNam = "omptarget_nest_par_call_struct_size";
-err = interop_get_symbol_info((char *)image->ImageStart, img_size,
-  SsNam, &StructSizePtr, &varsize);
-if ((err != ATMI_STATUS_SUCCESS) ||
-(varsize != sizeof(TgtStackItemSize))) {
-  fprintf(stderr, "Addr of %s failed\n", SsNam);
-  return NULL;
-}
-memcpy(&TgtStackItemSize, StructSizePtr, sizeof(TgtStackItemSize));
-DP("Size of our struct is %d\n", TgtStackItemSize);
-  }
 
   // Get ExecMode
   ExecModeVal = KernDescVal.Mode;
@@ -1298,8 +1267,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
   check("Loading WGSize computation property", err);
 }
 
-KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, MaxParLevVal,
-   device_id, CallStackAddr, e->name,
+KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id,
+   CallStackAddr, e->name,
kernarg_segment_size));
 __tgt_offload_entry entry = *e;
 entry.addr = (void *)&KernelsList.back();
@@ -1518,34 +1487,6 @@ void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize,
  threadsPerGroup);
 }
 
-static void *AllocateNestedParallelCallMemory(int MaxParLevel, int NumGroups,
-  int ThreadsPerGroup,
-  int device_id,
-