[PATCH] D142393: [OpenMP] Add 'amdgpu-flat-work-group-size' to OpenMP kernels

Joseph Huber via Phabricator via cfe-commits Mon, 23 Jan 2023 11:26:12 -0800

jhuber6 created this revision.
jhuber6 added reviewers: JonChesterfield, arsenm, tra, yaxunl, jdoerfert.
Herald added subscribers: kosarev, kerbowa, guansong, tpr, dstuttard, jvesely, 
kzhuravl.
Herald added a project: All.
jhuber6 requested review of this revision.
Herald added subscribers: cfe-commits, sstefan1, MaskRay, wdng.
Herald added a project: clang.


This patch adds the `amdgpu-flat-work-group-size` attribute to OpenMP
kernels targeting AMDGPU. Its value is `1,1024` by default; the upper
bound is taken from `--gpu-max-threads-per-block`, which is no longer
restricted to being a HIP-only option.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D142393

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/include/clang/Driver/Options.td
  clang/lib/CodeGen/TargetInfo.cpp
  clang/lib/Driver/ToolChains/AMDGPU.cpp
  clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
  clang/lib/Driver/ToolChains/HIPAMD.cpp
  clang/lib/Frontend/CompilerInvocation.cpp
  clang/test/Driver/openmp-offload-gpu.c
  clang/test/OpenMP/amdgcn-attributes.cpp

Index: clang/test/OpenMP/amdgcn-attributes.cpp
===================================================================
--- clang/test/OpenMP/amdgcn-attributes.cpp
+++ clang/test/OpenMP/amdgcn-attributes.cpp
@@ -2,6 +2,7 @@
 
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefixes=DEFAULT,ALL %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s --gpu-max-threads-per-block=512 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefixes=MAX-THREADS %s
 // RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefixes=CPU,ALL %s
 
 // RUN: %clang_cc1 -menable-no-nans -mno-amdgpu-ieee -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefixes=NOIEEE,ALL %s
@@ -32,12 +33,14 @@
   return x + 1;
 }
 
-// DEFAULT: attributes #0 = { convergent noinline norecurse nounwind optnone "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
-// CPU: attributes #0 = { convergent noinline norecurse nounwind optnone "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
-// NOIEEE: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-ieee"="false" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
-// UNSAFEATOMIC: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-unsafe-fp-atomics"="true" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// DEFAULT: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// MAX-THREADS: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,512" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// CPU: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
+// NOIEEE: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-ieee"="false" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// UNSAFEATOMIC: attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-unsafe-fp-atomics"="true" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
 
 // DEFAULT: attributes #1 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// MAX-THREADS: attributes #1 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 // CPU: attributes #1 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
 // NOIEEE: attributes #1 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 // UNSAFEATOMIC: attributes #1 = { convergent mustprogress noinline nounwind optnone "amdgpu-unsafe-fp-atomics"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
Index: clang/test/Driver/openmp-offload-gpu.c
===================================================================
--- clang/test/Driver/openmp-offload-gpu.c
+++ clang/test/Driver/openmp-offload-gpu.c
@@ -373,3 +373,13 @@
 // RUN:   | FileCheck --check-prefix=XARCH-DEVICE %s
 // XARCH-DEVICE: "-cc1" "-triple" "nvptx64-nvidia-cuda"{{.*}}"-O3"
 // XARCH-DEVICE-NOT: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-O3"
+
+//
+// Check that `--gpu-max-threads-per-block` works for AMDGPU OpenMP offloading.
+//
+// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \
+// RUN:     -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx1030 \
+// RUN:     -nogpulib --gpu-max-threads-per-block=512 %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=AMD-MAX-THREADS %s
+// AMD-MAX-THREADS: "-cc1" {{.*}} "--gpu-max-threads-per-block=512"
+// AMD-MAX-THREADS-SAME: "-fopenmp-is-device"
Index: clang/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -518,7 +518,8 @@
     Diags.Report(diag::warn_ignored_hip_only_option)
         << Args.getLastArg(OPT_fgpu_allow_device_init)->getAsString(Args);
 
-  if (Args.hasArg(OPT_gpu_max_threads_per_block_EQ) && !LangOpts.HIP)
+  if (Args.hasArg(OPT_gpu_max_threads_per_block_EQ) && !LangOpts.HIP &&
+      !LangOpts.OpenMPIsDevice)
     Diags.Report(diag::warn_ignored_hip_only_option)
         << Args.getLastArg(OPT_gpu_max_threads_per_block_EQ)->getAsString(Args);
 
Index: clang/lib/Driver/ToolChains/HIPAMD.cpp
===================================================================
--- clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -233,13 +233,11 @@
                           false))
     CC1Args.append({"-mllvm", "-amdgpu-internalize-symbols"});
 
-  StringRef MaxThreadsPerBlock =
-      DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ);
-  if (!MaxThreadsPerBlock.empty()) {
-    std::string ArgStr =
-        (Twine("--gpu-max-threads-per-block=") + MaxThreadsPerBlock).str();
-    CC1Args.push_back(DriverArgs.MakeArgStringRef(ArgStr));
-  }
+  if (DriverArgs.hasArg(options::OPT_gpu_max_threads_per_block_EQ))
+    CC1Args.push_back(DriverArgs.MakeArgString(
+        "--gpu-max-threads-per-block=" +
+        DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ,
+                                   "1024")));
 
   CC1Args.push_back("-fcuda-allow-variadic-functions");
 
Index: clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
===================================================================
--- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -54,6 +54,12 @@
   CC1Args.push_back(DriverArgs.MakeArgStringRef(GPUArch));
   CC1Args.push_back("-fcuda-is-device");
 
+  if (DriverArgs.hasArg(options::OPT_gpu_max_threads_per_block_EQ))
+    CC1Args.push_back(DriverArgs.MakeArgString(
+        "--gpu-max-threads-per-block=" +
+        DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ,
+                                   "1024")));
+
   if (DriverArgs.hasArg(options::OPT_nogpulib))
     return;
 
Index: clang/lib/Driver/ToolChains/AMDGPU.cpp
===================================================================
--- clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -788,7 +788,7 @@
   if (DriverArgs.hasArg(options::OPT_nogpulib))
     return;
 
-  // Get the device name and canonicalize it
+  // Get the device name and canonicalize it
   const StringRef GpuArch = getGPUArch(DriverArgs);
   auto Kind = llvm::AMDGPU::parseArchAMDGCN(GpuArch);
   const StringRef CanonArch = llvm::AMDGPU::getArchNameAMDGCN(Kind);
Index: clang/lib/CodeGen/TargetInfo.cpp
===================================================================
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -9549,6 +9549,10 @@
   // TODO: This should be moved to language specific attributes instead.
   if (IsHIPKernel || IsOpenMPkernel)
     F->addFnAttr("uniform-work-group-size", "true");
+  if (IsOpenMPkernel)
+    F->addFnAttr("amdgpu-flat-work-group-size",
+                 std::string("1,") +
+                     llvm::utostr(M.getLangOpts().GPUMaxThreadsPerBlock));
 
   if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
     F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -1047,9 +1047,8 @@
   BothFlags<[HelpHidden], " in overloading resolution for CUDA/HIP">>;
 def gpu_max_threads_per_block_EQ : Joined<["--"], "gpu-max-threads-per-block=">,
   Flags<[CC1Option]>,
-  HelpText<"Default max threads per block for kernel launch bounds for HIP">,
-  MarshallingInfoInt<LangOpts<"GPUMaxThreadsPerBlock">, "1024">,
-  ShouldParseIf<hip.KeyPath>;
+  HelpText<"Default max threads per block for kernel launch bounds for OpenMP/HIP">,
+  MarshallingInfoInt<LangOpts<"GPUMaxThreadsPerBlock">, "1024">;
 def fgpu_inline_threshold_EQ : Joined<["-"], "fgpu-inline-threshold=">,
   Flags<[HelpHidden]>,
   HelpText<"Inline threshold for device compilation for CUDA/HIP">;
Index: clang/include/clang/Basic/LangOptions.def
===================================================================
--- clang/include/clang/Basic/LangOptions.def
+++ clang/include/clang/Basic/LangOptions.def
@@ -268,7 +268,7 @@
 LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
 LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
 LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP")
-LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for HIP")
+LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for OpenMP/HIP")
 LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
 LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP")
 LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.")

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D142393: [OpenMP] Add 'amdgpu-flat-work-group-size' to OpenMP kernels

Reply via email to