Author: Joseph Huber Date: 2022-08-23T14:09:51-05:00 New Revision: 2b8f722e630d0fdf1ca267361866a27c8d4c9387
URL: https://github.com/llvm/llvm-project/commit/2b8f722e630d0fdf1ca267361866a27c8d4c9387 DIFF: https://github.com/llvm/llvm-project/commit/2b8f722e630d0fdf1ca267361866a27c8d4c9387.diff LOG: [OpenMP] Add option to assert no nested OpenMP parallelism on the GPU The OpenMP device runtime needs to support the OpenMP standard. However constructs like nested parallelism are very uncommon in real application yet lead to complexity in the runtime that is sometimes difficult to optimize out. As a stop-gap for performance we should supply an argument that selectively disables this feature. This patch adds the `-fopenmp-assume-no-nested-parallelism` argument which explicitly disables the usee of nested parallelism in OpenMP. Reviewed By: carlo.bertolli Differential Revision: https://reviews.llvm.org/D132074 Added: Modified: clang/include/clang/Basic/LangOptions.def clang/include/clang/Driver/Options.td clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp clang/lib/Driver/ToolChains/Clang.cpp clang/test/OpenMP/target_globals_codegen.cpp openmp/libomptarget/DeviceRTL/include/Configuration.h openmp/libomptarget/DeviceRTL/src/Configuration.cpp openmp/libomptarget/DeviceRTL/src/Parallelism.cpp Removed: ################################################################################ diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 4f3f6fc9da8c0..530f82268eb09 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -254,6 +254,7 @@ LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.") LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.") LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.") +LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region") LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.") LANGOPT(NoGPULib , 1, 0, "Indicate a build without the standard GPU libraries.") LANGOPT(RenderScript , 1, 0, "RenderScript") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 0666768c7e74b..76aed6a7c928f 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2585,6 +2585,10 @@ def fopenmp_assume_no_thread_state : Flag<["-"], "fopenmp-assume-no-thread-state Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, HelpText<"Assert no thread in a parallel region modifies an ICV">, MarshallingInfoFlag<LangOpts<"OpenMPNoThreadState">>; +def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested-parallelism">, Group<f_Group>, + Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, + HelpText<"Assert no nested parallel regions in the GPU">, + MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>; def fopenmp_offload_mandatory : Flag<["-"], "fopenmp-offload-mandatory">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>, HelpText<"Do not create a host fallback if offloading to the device fails.">, diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index e8affaaecc980..54e9c4d844a3c 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1213,6 +1213,8 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) "__omp_rtl_assume_threads_oversubscription"); OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState, "__omp_rtl_assume_no_thread_state"); + OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism, + "__omp_rtl_assume_no_nested_parallelism"); } void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index b867adc553033..34bf7a797e7e8 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6128,6 +6128,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-fopenmp-assume-threads-oversubscription"); if (Args.hasArg(options::OPT_fopenmp_assume_no_thread_state)) CmdArgs.push_back("-fopenmp-assume-no-thread-state"); + if (Args.hasArg(options::OPT_fopenmp_assume_no_nested_parallelism)) + CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism"); if (Args.hasArg(options::OPT_fopenmp_offload_mandatory)) CmdArgs.push_back("-fopenmp-offload-mandatory"); break; @@ -8426,8 +8428,9 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, for (StringRef LibName : BCLibs) CmdArgs.push_back(Args.MakeArgString( - "--bitcode-library=" + Action::GetOffloadKindName(Action::OFK_OpenMP) + - "-" + TC->getTripleString() + "-" + Arch + "=" + LibName)); + "--bitcode-library=" + + Action::GetOffloadKindName(Action::OFK_OpenMP) + "-" + + TC->getTripleString() + "-" + Arch + "=" + LibName)); } if (D.isUsingLTO(/* IsOffload */ true)) { diff --git a/clang/test/OpenMP/target_globals_codegen.cpp b/clang/test/OpenMP/target_globals_codegen.cpp index ad0c097e09103..45baa7b5d0445 100644 --- a/clang/test/OpenMP/target_globals_codegen.cpp +++ b/clang/test/OpenMP/target_globals_codegen.cpp @@ -7,6 +7,7 @@ // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-thread-state -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-STATE +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-nested-parallelism -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-NESTED // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -nogpulib -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-RUNTIME // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME // expected-no-diagnostics @@ -19,36 +20,49 @@ // CHECK: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. // CHECK-EQ: @__omp_rtl_debug_kind = weak_odr hidden constant i32 111 // CHECK-EQ: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-EQ: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK-EQ: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-EQ: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. // CHECK-DEFAULT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-DEFAULT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-DEFAULT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK-DEFAULT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-DEFAULT: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. // CHECK-THREADS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-THREADS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-THREADS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 1 // CHECK-THREADS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-THREADS: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. // CHECK-TEAMS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-TEAMS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1 // CHECK-TEAMS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK-TEAMS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-TEAMS: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. // CHECK-STATE: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-STATE: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-STATE: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK-STATE: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 1 +// CHECK-STATE: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 +//. +// CHECK-NESTED: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 +// CHECK-NESTED: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 +// CHECK-NESTED: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-NESTED: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-NESTED: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 1 //. // CHECK-RUNTIME-NOT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. void foo() { #pragma omp target diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h index 368a7c35ac4a8..72514b82896f9 100644 --- a/openmp/libomptarget/DeviceRTL/include/Configuration.h +++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h @@ -44,6 +44,10 @@ bool isDebugMode(DebugKind Level); /// explicitly disabled by the user. bool mayUseThreadStates(); +/// Indicates if this kernel may require data environments for nested +/// parallelism, or if it was explicitly disabled by the user. +bool mayUseNestedParallelism(); + } // namespace config } // namespace _OMP diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp index b3d779a96361f..a41574cfc21d6 100644 --- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -23,6 +23,7 @@ using namespace _OMP; // defined by CGOpenMPRuntimeGPU extern uint32_t __omp_rtl_debug_kind; extern uint32_t __omp_rtl_assume_no_thread_state; +extern uint32_t __omp_rtl_assume_no_nested_parallelism; // TODO: We want to change the name as soon as the old runtime is gone. // This variable should be visibile to the plugin so we override the default @@ -52,4 +53,8 @@ bool config::isDebugMode(config::DebugKind Kind) { bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; } +bool config::mayUseNestedParallelism() { + return !__omp_rtl_assume_no_nested_parallelism; +} + #pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp index 27d1ff2e5a55c..5ebf3687ac43f 100644 --- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp @@ -86,11 +86,16 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, uint32_t TId = mapping::getThreadIdInBlock(); + // Assert the parallelism level is zero if disabled by the user. + ASSERT((config::mayUseNestedParallelism() || icv::Level == 0) && + "nested parallelism while disabled"); + // Handle the serialized case first, same for SPMD/non-SPMD: // 1) if-clause(0) - // 2) nested parallel regions - // 3) parallel in task or other thread state inducing construct - if (OMP_UNLIKELY(!if_expr || icv::Level || state::HasThreadState)) { + // 2) parallel in task or other thread state inducing construct + // 3) nested parallel regions + if (OMP_UNLIKELY(!if_expr || state::HasThreadState || + (config::mayUseNestedParallelism() && icv::Level))) { state::DateEnvironmentRAII DERAII(ident); ++icv::Level; invokeMicrotask(TId, 0, fn, args, nargs); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits