This revision was automatically updated to reflect the committed changes.
Closed by commit rG9f2d8b5c0cdb: [HIP] Add option --gpu-max-threads-per-block=n (authored by yaxunl).
Herald added a project: clang.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D71221/new/
https://reviews.llvm.org/D71221
Files:
clang/include/clang/Basic/LangOptions.def
clang/include/clang/Driver/Options.td
clang/lib/CodeGen/TargetInfo.cpp
clang/lib/Driver/ToolChains/HIP.cpp
clang/lib/Frontend/CompilerInvocation.cpp
clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
clang/test/Driver/hip-options.hip
Index: clang/test/Driver/hip-options.hip
===================================================================
--- /dev/null
+++ clang/test/Driver/hip-options.hip
@@ -0,0 +1,10 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang -### -x hip --gpu-max-threads-per-block=1024 %s 2>&1 | FileCheck %s
+
+// Check that there are commands for both host- and device-side compilations.
+//
+// CHECK: clang{{.*}}" "-cc1" {{.*}} "-fcuda-is-device"
+// CHECK-SAME: "--gpu-max-threads-per-block=1024"
Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
===================================================================
--- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -1,13 +1,21 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
-// RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s
+// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefixes=CHECK,DEFAULT %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \
+// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN: | FileCheck -check-prefixes=CHECK,MAX1024 %s
// RUN: %clang_cc1 -triple nvptx \
// RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s \
// RUN: -check-prefix=NAMD
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \
-// RUN: -verify -o - %s | FileCheck -check-prefix=NAMD %s
+// RUN: -verify -o - -x hip %s | FileCheck -check-prefix=NAMD %s
#include "Inputs/cuda.h"
+__global__ void flat_work_group_size_default() {
+// CHECK: define amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]]
+}
+
__attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics
__global__ void flat_work_group_size_32_64() {
// CHECK: define amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]]
@@ -31,7 +39,9 @@
// NAMD-NOT: "amdgpu-num-vgpr"
// NAMD-NOT: "amdgpu-num-sgpr"
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = { convergent noinline nounwind optnone "amdgpu-flat-work-group-size"="32,64"
-// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = { convergent noinline nounwind optnone "amdgpu-waves-per-eu"="2"
-// CHECK-DAG: attributes [[NUM_SGPR_32]] = { convergent noinline nounwind optnone "amdgpu-num-sgpr"="32"
-// CHECK-DAG: attributes [[NUM_VGPR_64]] = { convergent noinline nounwind optnone "amdgpu-num-vgpr"="64"
+// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"
+// MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
+// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
+// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
+// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
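
For reference, the behavior the updated test pins down can be reproduced with a minimal HIP translation unit. The sketch below is illustrative only and not part of the patch: the kernel names are hypothetical, and it assumes the standard <hip/hip_runtime.h> header and a device-side compile (-x hip -fcuda-is-device).

  // Illustrative sketch, not part of the patch.
  #include <hip/hip_runtime.h>

  // No explicit attribute: the kernel gets the default
  // "amdgpu-flat-work-group-size"="1,256", or "1,1024" when the TU is
  // compiled with --gpu-max-threads-per-block=1024.
  __global__ void default_bounds() {}

  // An explicit amdgpu_flat_work_group_size attribute still wins over the
  // flag, as the "32,64" check above shows.
  __attribute__((amdgpu_flat_work_group_size(32, 64)))
  __global__ void explicit_bounds() {}
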
Index: clang/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -2559,6 +2559,12 @@
<< Args.getLastArg(OPT_fgpu_allow_device_init)->getAsString(Args);
}
Opts.HIPUseNewLaunchAPI = Args.hasArg(OPT_fhip_new_launch_api);
+ if (Opts.HIP)
+ Opts.GPUMaxThreadsPerBlock = getLastArgIntValue(
+ Args, OPT_gpu_max_threads_per_block_EQ, Opts.GPUMaxThreadsPerBlock);
+ else if (Args.hasArg(OPT_gpu_max_threads_per_block_EQ))
+ Diags.Report(diag::warn_ignored_hip_only_option)
+ << Args.getLastArg(OPT_gpu_max_threads_per_block_EQ)->getAsString(Args);
if (Opts.ObjC) {
if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
Index: clang/lib/Driver/ToolChains/HIP.cpp
===================================================================
--- clang/lib/Driver/ToolChains/HIP.cpp
+++ clang/lib/Driver/ToolChains/HIP.cpp
@@ -307,6 +307,14 @@
false))
CC1Args.push_back("-fgpu-rdc");
+ StringRef MaxThreadsPerBlock =
+ DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ);
+ if (!MaxThreadsPerBlock.empty()) {
+ std::string ArgStr =
+ std::string("--gpu-max-threads-per-block=") + MaxThreadsPerBlock.str();
+ CC1Args.push_back(DriverArgs.MakeArgStringRef(ArgStr));
+ }
+
if (DriverArgs.hasFlag(options::OPT_fgpu_allow_device_init,
options::OPT_fno_gpu_allow_device_init, false))
CC1Args.push_back("-fgpu-allow-device-init");
Index: clang/lib/CodeGen/TargetInfo.cpp
===================================================================
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -8072,8 +8072,11 @@
} else
assert(Max == 0 && "Max must be zero");
} else if (IsOpenCLKernel || IsHIPKernel) {
- // By default, restrict the maximum size to 256.
- F->addFnAttr("amdgpu-flat-work-group-size", "1,256");
+ // By default, restrict the maximum size to a value specified by
+ // --gpu-max-threads-per-block=n or its default value.
+ std::string AttrVal =
+ std::string("1,") + llvm::utostr(M.getLangOpts().GPUMaxThreadsPerBlock);
+ F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
}
if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>()) {
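
Note that the new default only applies in the else-if branch, i.e. to OpenCL/HIP kernels with no explicit flat work-group-size attribute; explicitly attributed kernels keep their own values, as the "32,64" codegen check above confirms. A minimal sketch of the attribute value being built (hypothetical helper name, not the clang code itself):

  #include <string>

  // "amdgpu-flat-work-group-size" takes "<min>,<max>"; min stays 1 and max
  // comes from the --gpu-max-threads-per-block language option (default 256).
  std::string defaultFlatWorkGroupSize(unsigned GPUMaxThreadsPerBlock) {
    return "1," + std::to_string(GPUMaxThreadsPerBlock);
  }
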
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -606,6 +606,9 @@
def fgpu_allow_device_init : Flag<["-"], "fgpu-allow-device-init">,
Flags<[CC1Option]>, HelpText<"Allow device side init function in HIP">;
def fno_gpu_allow_device_init : Flag<["-"], "fno-gpu-allow-device-init">;
+def gpu_max_threads_per_block_EQ : Joined<["--"], "gpu-max-threads-per-block=">,
+ Flags<[CC1Option]>,
+ HelpText<"Default max threads per block for kernel launch bounds for HIP">;
def libomptarget_nvptx_path_EQ : Joined<["--"], "libomptarget-nvptx-path=">, Group<i_Group>,
HelpText<"Path to libomptarget-nvptx libraries">;
def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
Index: clang/include/clang/Basic/LangOptions.def
===================================================================
--- clang/include/clang/Basic/LangOptions.def
+++ clang/include/clang/Basic/LangOptions.def
@@ -227,6 +227,7 @@
LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP")
+LANGOPT(GPUMaxThreadsPerBlock, 32, 256, "default max threads per block for kernel launch bounds for HIP")
LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device")