Author: Joseph Huber Date: 2022-02-18T08:35:05-05:00 New Revision: 0870a4f59aef21bf7707b00ebd4dcad7ce7ef807
URL: https://github.com/llvm/llvm-project/commit/0870a4f59aef21bf7707b00ebd4dcad7ce7ef807 DIFF: https://github.com/llvm/llvm-project/commit/0870a4f59aef21bf7707b00ebd4dcad7ce7ef807.diff LOG: [OpenMP] Add flag for disabling thread state in runtime The runtime uses thread state values to indicate when we use an ICV or are in nested parallelism. This is done for OpenMP correctness, but it not needed in the majority of cases. The new flag added is `-fopenmp-assume-no-thread-state`. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D120106 Added: Modified: clang/include/clang/Basic/LangOptions.def clang/include/clang/Driver/Options.td clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp clang/lib/Driver/ToolChains/Clang.cpp clang/test/OpenMP/target_globals_codegen.cpp openmp/libomptarget/DeviceRTL/include/Configuration.h openmp/libomptarget/DeviceRTL/src/Configuration.cpp openmp/libomptarget/DeviceRTL/src/State.cpp Removed: ################################################################################ diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 4651f4fff6aa0..e21998860f217 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -246,6 +246,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.") LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.") LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.") +LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.") LANGOPT(RenderScript , 1, 0, "RenderScript") LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 76cfdbcd85f26..c377329e8f6f4 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2473,6 +2473,10 @@ def fno_openmp_assume_teams_oversubscription : Flag<["-"], "fno-openmp-assume-te Group<f_Group>, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume-threads-oversubscription">, Group<f_Group>, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; +def fopenmp_assume_no_thread_state : Flag<["-"], "fopenmp-assume-no-thread-state">, Group<f_Group>, + Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, + HelpText<"Assert no thread in a parallel region modifies an ICV">, + MarshallingInfoFlag<LangOpts<"OpenMPNoThreadState">>; defm openmp_target_new_runtime: BoolFOption<"openmp-target-new-runtime", LangOpts<"OpenMPTargetNewRuntime">, DefaultTrue, PosFlag<SetTrue, [CC1Option], "Use the new bitcode library for OpenMP offloading">, diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index bb6847ab87319..fcaf9d4ed77b3 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1210,6 +1210,8 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) "__omp_rtl_assume_teams_oversubscription"); OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription, "__omp_rtl_assume_threads_oversubscription"); + OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState, + "__omp_rtl_assume_no_thread_state"); } } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index a16175ebebbca..32cbb7936f7ee 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5995,6 +5995,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_openmp_assume_threads_oversubscription, /*Default=*/false)) CmdArgs.push_back("-fopenmp-assume-threads-oversubscription"); + if (Args.hasArg(options::OPT_fopenmp_assume_no_thread_state)) + CmdArgs.push_back("-fopenmp-assume-no-thread-state"); break; default: // By default, if Clang doesn't know how to generate useful OpenMP code diff --git a/clang/test/OpenMP/target_globals_codegen.cpp b/clang/test/OpenMP/target_globals_codegen.cpp index fa7569cd4ca6b..3c5d4b8ed3984 100644 --- a/clang/test/OpenMP/target_globals_codegen.cpp +++ b/clang/test/OpenMP/target_globals_codegen.cpp @@ -6,6 +6,7 @@ // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-DEFAULT // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-thread-state -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-STATE // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME // expected-no-diagnostics @@ -16,26 +17,37 @@ // CHECK: @__omp_rtl_debug_kind = weak_odr hidden constant i32 1 // CHECK: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 //. // CHECK-EQ: @__omp_rtl_debug_kind = weak_odr hidden constant i32 111 // CHECK-EQ: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-EQ: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-EQ: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 //. // CHECK-DEFAULT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-DEFAULT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-DEFAULT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-DEFAULT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 //. // CHECK-THREADS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-THREADS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-THREADS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 1 +// CHECK-THREADS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 //. // CHECK-TEAMS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-TEAMS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1 // CHECK-TEAMS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-TEAMS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +//. +// CHECK-STATE: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 +// CHECK-STATE: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 +// CHECK-STATE: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-STATE: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 1 //. // CHECK-RUNTIME-NOT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 //. void foo() { #pragma omp target diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h index 5727f1f2bfbf6..94f11b6066a20 100644 --- a/openmp/libomptarget/DeviceRTL/include/Configuration.h +++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h @@ -38,8 +38,13 @@ uint32_t getDebugKind(); /// Return the amount of dynamic shared memory that was allocated at launch. uint64_t getDynamicMemorySize(); +/// Return if debugging is enabled for the given debug kind. bool isDebugMode(DebugKind Level); +/// Indicates if this kernel may require thread-specific states, or if it was +/// explicitly disabled by the user. +bool mayUseThreadStates(); + } // namespace config } // namespace _OMP diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp index 349f93a08701c..e9cc9bb0e318e 100644 --- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -20,7 +20,9 @@ using namespace _OMP; #pragma omp declare target -extern uint32_t __omp_rtl_debug_kind; // defined by CGOpenMPRuntimeGPU +// defined by CGOpenMPRuntimeGPU +extern uint32_t __omp_rtl_debug_kind; +extern uint32_t __omp_rtl_assume_no_thread_state; // TODO: We want to change the name as soon as the old runtime is gone. // This variable should be visibile to the plugin so we override the default @@ -48,4 +50,6 @@ bool config::isDebugMode(config::DebugKind Kind) { return config::getDebugKind() & Kind; } +bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; } + #pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp index a04f5cccb1738..a530c5e0b2471 100644 --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -285,7 +285,8 @@ ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam]; #pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc) uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) { - if (OMP_LIKELY(TeamState.ICVState.LevelVar == 0)) + if (OMP_LIKELY(!config::mayUseThreadStates() || + TeamState.ICVState.LevelVar == 0)) return TeamState.ICVState.*Var; uint32_t TId = mapping::getThreadIdInBlock(); if (!ThreadStates[TId]) { @@ -299,13 +300,13 @@ uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) { uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) { uint32_t TId = mapping::getThreadIdInBlock(); - if (OMP_UNLIKELY(ThreadStates[TId])) + if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId])) return ThreadStates[TId]->ICVState.*Var; return TeamState.ICVState.*Var; } uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) { uint64_t TId = mapping::getThreadIdInBlock(); - if (OMP_UNLIKELY(ThreadStates[TId])) + if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId])) return ThreadStates[TId]->ICVState.*Var; return TeamState.ICVState.*Var; } @@ -380,6 +381,9 @@ void state::init(bool IsSPMD) { } void state::enterDataEnvironment(IdentTy *Ident) { + ASSERT(config::mayUseThreadStates() && + "Thread state modified while explicitly disabled!"); + unsigned TId = mapping::getThreadIdInBlock(); ThreadStateTy *NewThreadState = static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy))); @@ -388,6 +392,9 @@ void state::enterDataEnvironment(IdentTy *Ident) { } void state::exitDataEnvironment() { + ASSERT(config::mayUseThreadStates() && + "Thread state modified while explicitly disabled!"); + unsigned TId = mapping::getThreadIdInBlock(); resetStateForThread(TId); } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits