jhuber6 created this revision. jhuber6 added reviewers: tianshilei1992, JonChesterfield, jdoerfert, ronlieb, carlo.bertolli, gregrodgers. Herald added subscribers: guansong, yaxunl. Herald added a project: All. jhuber6 requested review of this revision. Herald added subscribers: openmp-commits, cfe-commits, sstefan1, MaskRay. Herald added projects: clang, OpenMP.
The OpenMP device runtime needs to support the OpenMP standard. However constructs like nested parallelism are very uncommon in real application yet lead to complexity in the runtime that is sometimes difficult to optimize out. As a stop-gap for performance we should supply an argument that selectively disables this feature. This patch adds the `-fopenmp-assume-no-nested-parallelism` argument which explicitly disables the usee of nested parallelism in OpenMP. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D132074 Files: clang/include/clang/Basic/LangOptions.def clang/include/clang/Driver/Options.td clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp clang/lib/Driver/ToolChains/Clang.cpp clang/test/OpenMP/target_globals_codegen.cpp openmp/libomptarget/DeviceRTL/include/Configuration.h openmp/libomptarget/DeviceRTL/src/Configuration.cpp openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
Index: openmp/libomptarget/DeviceRTL/src/Parallelism.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/Parallelism.cpp +++ openmp/libomptarget/DeviceRTL/src/Parallelism.cpp @@ -86,11 +86,16 @@ uint32_t TId = mapping::getThreadIdInBlock(); + // Assert the parallelism level is zero if disabled by the user. + if (!config::mayUseNestedParallelism()) + ASSERT(icv::Level == 0 && "nested parallelism while disabled"); + // Handle the serialized case first, same for SPMD/non-SPMD: // 1) if-clause(0) - // 2) nested parallel regions - // 3) parallel in task or other thread state inducing construct - if (OMP_UNLIKELY(!if_expr || icv::Level || state::HasThreadState)) { + // 2) parallel in task or other thread state inducing construct + // 3) nested parallel regions + if (OMP_UNLIKELY(!if_expr || state::HasThreadState || + (icv::Level && config::mayUseNestedParallelism()))) { state::DateEnvironmentRAII DERAII(ident); ++icv::Level; invokeMicrotask(TId, 0, fn, args, nargs); Index: openmp/libomptarget/DeviceRTL/src/Configuration.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/Configuration.cpp +++ openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -23,6 +23,7 @@ // defined by CGOpenMPRuntimeGPU extern uint32_t __omp_rtl_debug_kind; extern uint32_t __omp_rtl_assume_no_thread_state; +extern uint32_t __omp_rtl_assume_no_nested_parallelism; // TODO: We want to change the name as soon as the old runtime is gone. // This variable should be visibile to the plugin so we override the default @@ -52,4 +53,8 @@ bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; } +bool config::mayUseNestedParallelism() { + return !__omp_rtl_assume_no_nested_parallelism; +} + #pragma omp end declare target Index: openmp/libomptarget/DeviceRTL/include/Configuration.h =================================================================== --- openmp/libomptarget/DeviceRTL/include/Configuration.h +++ openmp/libomptarget/DeviceRTL/include/Configuration.h @@ -44,6 +44,10 @@ /// explicitly disabled by the user. bool mayUseThreadStates(); +/// Indicates if this kernel may require data environments for nested +/// parallelism, or if it was explicitly disabled by the user. +bool mayUseNestedParallelism(); + } // namespace config } // namespace _OMP Index: clang/test/OpenMP/target_globals_codegen.cpp =================================================================== --- clang/test/OpenMP/target_globals_codegen.cpp +++ clang/test/OpenMP/target_globals_codegen.cpp @@ -7,6 +7,7 @@ // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-thread-state -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-STATE +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-nested-parallelism -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-NESTED // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -nogpulib -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-RUNTIME // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME // expected-no-diagnostics @@ -19,36 +20,49 @@ // CHECK: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. // CHECK-EQ: @__omp_rtl_debug_kind = weak_odr hidden constant i32 111 // CHECK-EQ: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-EQ: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK-EQ: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-EQ: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. // CHECK-DEFAULT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-DEFAULT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-DEFAULT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK-DEFAULT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-DEFAULT: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. // CHECK-THREADS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-THREADS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-THREADS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 1 // CHECK-THREADS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-THREADS: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. // CHECK-TEAMS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-TEAMS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1 // CHECK-TEAMS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK-TEAMS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-TEAMS: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. // CHECK-STATE: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-STATE: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-STATE: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK-STATE: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 1 +// CHECK-STATE: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 +//. +// CHECK-NESTED: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 +// CHECK-NESTED: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 +// CHECK-NESTED: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-NESTED: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-NESTED: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 1 //. // CHECK-RUNTIME-NOT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. void foo() { #pragma omp target Index: clang/lib/Driver/ToolChains/Clang.cpp =================================================================== --- clang/lib/Driver/ToolChains/Clang.cpp +++ clang/lib/Driver/ToolChains/Clang.cpp @@ -6132,6 +6132,8 @@ CmdArgs.push_back("-fopenmp-assume-threads-oversubscription"); if (Args.hasArg(options::OPT_fopenmp_assume_no_thread_state)) CmdArgs.push_back("-fopenmp-assume-no-thread-state"); + if (Args.hasArg(options::OPT_fopenmp_assume_no_nested_parallelism)) + CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism"); if (Args.hasArg(options::OPT_fopenmp_offload_mandatory)) CmdArgs.push_back("-fopenmp-offload-mandatory"); break; @@ -8417,29 +8419,6 @@ } } - // Get the AMDGPU math libraries. - // FIXME: This method is bad, remove once AMDGPU has a proper math library - // (see AMDGCN::OpenMPLinker::constructLLVMLinkCommand). - for (auto &I : llvm::make_range(OpenMPTCRange.first, OpenMPTCRange.second)) { - const ToolChain *TC = I.second; - - if (!TC->getTriple().isAMDGPU() || Args.hasArg(options::OPT_nogpulib)) - continue; - - const ArgList &TCArgs = C.getArgsForToolChain(TC, "", Action::OFK_OpenMP); - StringRef Arch = TCArgs.getLastArgValue(options::OPT_march_EQ); - const toolchains::ROCMToolChain RocmTC(TC->getDriver(), TC->getTriple(), - TCArgs); - - SmallVector<std::string, 12> BCLibs = - RocmTC.getCommonDeviceLibNames(TCArgs, Arch.str()); - - for (StringRef LibName : BCLibs) - CmdArgs.push_back(Args.MakeArgString( - "--bitcode-library=" + Action::GetOffloadKindName(Action::OFK_OpenMP) + - "-" + TC->getTripleString() + "-" + Arch + "=" + LibName)); - } - if (D.isUsingLTO(/* IsOffload */ true)) { // Pass in the optimization level to use for LTO. if (const Arg *A = Args.getLastArg(options::OPT_O_Group)) { Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp =================================================================== --- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1213,6 +1213,8 @@ "__omp_rtl_assume_threads_oversubscription"); OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState, "__omp_rtl_assume_no_thread_state"); + OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism, + "__omp_rtl_assume_no_nested_parallelism"); } void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF, Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -2580,6 +2580,10 @@ Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, HelpText<"Assert no thread in a parallel region modifies an ICV">, MarshallingInfoFlag<LangOpts<"OpenMPNoThreadState">>; +def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested-parallelism">, Group<f_Group>, + Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, + HelpText<"Assert no nested parallel regions in the GPU">, + MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>; def fopenmp_offload_mandatory : Flag<["-"], "fopenmp-offload-mandatory">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>, HelpText<"Do not create a host fallback if offloading to the device fails.">, Index: clang/include/clang/Basic/LangOptions.def =================================================================== --- clang/include/clang/Basic/LangOptions.def +++ clang/include/clang/Basic/LangOptions.def @@ -253,6 +253,7 @@ LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.") LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.") LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.") +LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region") LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.") LANGOPT(NoGPULib , 1, 0, "Indicate a build without the standard GPU libraries.") LANGOPT(RenderScript , 1, 0, "RenderScript")
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits