[PATCH] D90251: [AMDGPU] Add __builtin_amdgcn_grid_size

Jon Chesterfield via Phabricator via cfe-commits Tue, 27 Oct 2020 10:25:28 -0700

JonChesterfield created this revision.
JonChesterfield added reviewers: yaxunl, arsenm, b-sumner, cfang, rjmccall, 
Anastasia.
Herald added subscribers: openmp-commits, cfe-commits, dexonsmith, kerbowa, 
t-tye, tpr, dstuttard, nhaehnle, jvesely, kzhuravl.
Herald added projects: clang, OpenMP.
JonChesterfield requested review of this revision.
Herald added subscribers: sstefan1, wdng.
Herald added a reviewer: jdoerfert.


[AMDGPU] Add __builtin_amdgcn_grid_size

Similar to D76772 <https://reviews.llvm.org/D76772>, loads the data from the 
dispatch pointer. Marked invariant.

Patch also updates the openmp devicertl to use this builtin.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D90251

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip

Index: openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
===================================================================
--- openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -119,12 +119,6 @@
 }
 
 namespace {
-DEVICE uint32_t grid_size_x() {
-  size_t grid_size_x_offset = 96; // In bits, from AQL kernel dispatch format
-  return *(uint32_t *)((char *)__builtin_amdgcn_dispatch_ptr() +
-                       grid_size_x_offset / 8);
-}
-
 DEVICE uint32_t get_grid_dim(uint32_t n, uint16_t d) {
   uint32_t q = n / d;
   return q + (n > q * d);
@@ -137,11 +131,11 @@
 } // namespace
 
 DEVICE int GetNumberOfBlocksInKernel() {
-  return get_grid_dim(grid_size_x(), __builtin_amdgcn_workgroup_size_x());
+  return get_grid_dim(__builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x());
 }
 
 DEVICE int GetNumberOfThreadsInBlock() {
-  return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), grid_size_x(),
+  return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), __builtin_amdgcn_grid_size_x(),
                            __builtin_amdgcn_workgroup_size_x());
 }
 
Index: clang/test/CodeGenOpenCL/builtins-amdgcn.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -559,6 +559,24 @@
 	}
 }
 
+// CHECK-LABEL: @test_get_grid_size(
+// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 12
+// CHECK: load i32, i32 addrspace(4)* %{{.*}}, align 4, !invariant.load
+// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 16
+// CHECK: load i32, i32 addrspace(4)* %{{.*}}, align 4, !invariant.load
+// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 20
+// CHECK: load i32, i32 addrspace(4)* %{{.*}}, align 4, !invariant.load
+void test_get_grid_size(int d, global int *out)
+{
+	switch (d) {
+	case 0: *out = __builtin_amdgcn_grid_size_x(); break;
+	case 1: *out = __builtin_amdgcn_grid_size_y(); break;
+	case 2: *out = __builtin_amdgcn_grid_size_z(); break;
+	default: *out = 0;
+	}
+}
+
 // CHECK-LABEL: @test_fmed3_f32
 // CHECK: call float @llvm.amdgcn.fmed3.f32(
 void test_fmed3_f32(global float* out, float a, float b, float c)
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -14750,6 +14750,22 @@
       llvm::MDNode::get(CGF.getLLVMContext(), None));
   return LD;
 }
+
+// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
+Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
+  const unsigned XOffset = 12;
+  auto *DP = EmitAMDGPUDispatchPtr(CGF);
+  // Indexing the HSA kernel_dispatch_packet struct.
+  auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
+  auto *GEP = CGF.Builder.CreateGEP(DP, Offset);
+  auto *DstTy =
+      CGF.Int32Ty->getPointerTo(GEP->getType()->getPointerAddressSpace());
+  auto *Cast = CGF.Builder.CreateBitCast(GEP, DstTy);
+  auto *LD = CGF.Builder.CreateLoad(Address(Cast, CharUnits::fromQuantity(4)));
+  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
+                  llvm::MDNode::get(CGF.getLLVMContext(), None));
+  return LD;
+}
 } // namespace
 
 // For processing memory ordering and memory scope arguments of various
@@ -15010,6 +15026,14 @@
   case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
     return EmitAMDGPUWorkGroupSize(*this, 2);
 
+  // amdgcn grid size
+  case AMDGPU::BI__builtin_amdgcn_grid_size_x:
+    return EmitAMDGPUGridSize(*this, 0);
+  case AMDGPU::BI__builtin_amdgcn_grid_size_y:
+    return EmitAMDGPUGridSize(*this, 1);
+  case AMDGPU::BI__builtin_amdgcn_grid_size_z:
+    return EmitAMDGPUGridSize(*this, 2);
+
   // r600 intrinsics
   case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
   case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
Index: clang/include/clang/Basic/BuiltinsAMDGPU.def
===================================================================
--- clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -37,6 +37,10 @@
 BUILTIN(__builtin_amdgcn_workgroup_size_y, "Us", "nc")
 BUILTIN(__builtin_amdgcn_workgroup_size_z, "Us", "nc")
 
+BUILTIN(__builtin_amdgcn_grid_size_x, "Ui", "nc")
+BUILTIN(__builtin_amdgcn_grid_size_y, "Ui", "nc")
+BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
+
 BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
 BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D90251: [AMDGPU] Add __builtin_amdgcn_grid_size

Reply via email to