https://github.com/kevinsala updated https://github.com/llvm/llvm-project/pull/152831
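For context while reading the patch below: a minimal usage sketch of the runtime entry points this PR adds (omp_get_groupprivate_limit and omp_get_dyn_groupprivate_ptr) together with the dyn_groupprivate clause they back. It is modeled on the new test offload/test/offloading/dyn_groupprivate_strict.cpp; the buffer size and the host-side printing are illustrative assumptions, and the clause itself relies on matching compiler-side support that is not part of this runtime patch.

#include <omp.h>
#include <stdio.h>

#define N 256

int main() {
  // Host-side query: upper bound on per-team (contention group) memory on device 0.
  size_t Limit = omp_get_groupprivate_limit(/*device_num=*/0, omp_access_cgroup);
  printf("groupprivate limit on device 0: %zu bytes\n", Limit);

  int Out[N];
  // Request N ints of dynamic cgroup memory. The 'fallback' modifier lets the
  // runtime serve the request from global memory if it exceeds the device limit.
#pragma omp target teams num_teams(1) thread_limit(N) \
    dyn_groupprivate(fallback : N * sizeof(int)) map(from : Out)
  {
#pragma omp parallel
    {
      int IsFallback;
      // Base pointer (offset 0) of this team's dynamic buffer.
      int *Buf =
          (int *)omp_get_dyn_groupprivate_ptr(0, &IsFallback, omp_access_cgroup);
      int TId = omp_get_thread_num();
      Buf[TId] = TId;
#pragma omp barrier
      // Read a neighbor's slot to show the buffer is shared within the team.
      Out[TId] = Buf[(TId + 1) % omp_get_num_threads()];
    }
  }
  return 0;
}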
>From fa3c7425ae9e5ffea83841f2be61b0f494b99038 Mon Sep 17 00:00:00 2001 From: Kevin Sala <salapenad...@llnl.gov> Date: Fri, 8 Aug 2025 11:25:14 -0700 Subject: [PATCH 1/2] [OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause --- offload/DeviceRTL/include/DeviceTypes.h | 4 + offload/DeviceRTL/include/Interface.h | 2 +- offload/DeviceRTL/include/State.h | 2 +- offload/DeviceRTL/src/Kernel.cpp | 14 +- offload/DeviceRTL/src/State.cpp | 48 +++++- offload/include/Shared/APITypes.h | 6 +- offload/include/Shared/Environment.h | 4 +- offload/include/device.h | 3 + offload/include/omptarget.h | 7 +- offload/libomptarget/OpenMP/API.cpp | 14 ++ offload/libomptarget/device.cpp | 6 + offload/libomptarget/exports | 1 + .../amdgpu/dynamic_hsa/hsa_ext_amd.h | 1 + offload/plugins-nextgen/amdgpu/src/rtl.cpp | 34 +++-- .../common/include/PluginInterface.h | 33 +++- .../common/src/PluginInterface.cpp | 86 ++++++++--- .../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 1 + offload/plugins-nextgen/cuda/src/rtl.cpp | 37 +++-- offload/plugins-nextgen/host/src/rtl.cpp | 4 +- .../offloading/dyn_groupprivate_strict.cpp | 141 ++++++++++++++++++ openmp/runtime/src/include/omp.h.var | 10 ++ openmp/runtime/src/kmp_csupport.cpp | 9 ++ openmp/runtime/src/kmp_stub.cpp | 16 ++ 23 files changed, 418 insertions(+), 65 deletions(-) create mode 100644 offload/test/offloading/dyn_groupprivate_strict.cpp diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h index 2e5d92380f040..a43b506d6879e 100644 --- a/offload/DeviceRTL/include/DeviceTypes.h +++ b/offload/DeviceRTL/include/DeviceTypes.h @@ -163,4 +163,8 @@ typedef enum omp_allocator_handle_t { ///} +enum omp_access_t { + omp_access_cgroup = 0, +}; + #endif diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h index c4bfaaa2404b4..672afea206785 100644 --- a/offload/DeviceRTL/include/Interface.h +++ b/offload/DeviceRTL/include/Interface.h @@ -222,7 +222,7 @@ struct KernelEnvironmentTy; int8_t __kmpc_is_spmd_exec_mode(); int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment); + KernelLaunchEnvironmentTy *KernelLaunchEnvironment); void __kmpc_target_deinit(); diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h index db396dae6e445..17c3c6f2d3e42 100644 --- a/offload/DeviceRTL/include/State.h +++ b/offload/DeviceRTL/include/State.h @@ -116,7 +116,7 @@ extern Local<ThreadStateTy **> ThreadStates; /// Initialize the state machinery. Must be called by all threads. void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment); + KernelLaunchEnvironmentTy *KernelLaunchEnvironment); /// Return the kernel and kernel launch environment associated with the current /// kernel. The former is static and contains compile time information that diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp index 467e44a65276c..58e9a09105a76 100644 --- a/offload/DeviceRTL/src/Kernel.cpp +++ b/offload/DeviceRTL/src/Kernel.cpp @@ -34,8 +34,8 @@ enum OMPTgtExecModeFlags : unsigned char { }; static void -inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { +initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, + KernelLaunchEnvironmentTy *KernelLaunchEnvironment) { // Order is important here. 
synchronize::init(IsSPMD); mapping::init(IsSPMD); @@ -80,17 +80,17 @@ extern "C" { /// \param Ident Source location identification, can be NULL. /// int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { + KernelLaunchEnvironmentTy *KernelLaunchEnvironment) { ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration; bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD; bool UseGenericStateMachine = Configuration.UseGenericStateMachine; if (IsSPMD) { - inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment, - KernelLaunchEnvironment); + initializeRuntime(/*IsSPMD=*/true, KernelEnvironment, + KernelLaunchEnvironment); synchronize::threadsAligned(atomic::relaxed); } else { - inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment, - KernelLaunchEnvironment); + initializeRuntime(/*IsSPMD=*/false, KernelEnvironment, + KernelLaunchEnvironment); // No need to wait since only the main threads will execute user // code and workers will run into a barrier right away. } diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp index 62b03e7bba720..9e2a9999167b4 100644 --- a/offload/DeviceRTL/src/State.cpp +++ b/offload/DeviceRTL/src/State.cpp @@ -158,6 +158,34 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) { memory::freeGlobal(Ptr, "Slow path shared memory deallocation"); } +struct DynCGroupMemTy { + void init(KernelLaunchEnvironmentTy *KLE, void *NativeDynCGroup) { + Size = 0; + Ptr = nullptr; + IsFallback = false; + if (KLE) { + Size = KLE->DynCGroupMemSize; + if (void *Fallback = KLE->DynCGroupMemFallback) { + Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num(); + IsFallback = true; + } else { + Ptr = static_cast<char *>(NativeDynCGroup); + } + } + } + + char *getPtr(size_t Offset) const { return Ptr + Offset; } + bool isFallback() const { return IsFallback; } + size_t getSize() const { return Size; } + +private: + char *Ptr; + size_t Size; + bool IsFallback; +}; + +[[clang::loader_uninitialized]] static Local<DynCGroupMemTy> DynCGroupMem; + } // namespace void *memory::getDynamicBuffer() { return DynamicSharedBuffer; } @@ -246,13 +274,18 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, } // namespace void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { + KernelLaunchEnvironmentTy *KLE) { SharedMemorySmartStack.init(IsSPMD); + + if (KLE == reinterpret_cast<KernelLaunchEnvironmentTy *>(~0)) + KLE = nullptr; + if (mapping::isInitialThreadInLevel0(IsSPMD)) { + DynCGroupMem.init(KLE, DynamicSharedBuffer); TeamState.init(IsSPMD); ThreadStates = nullptr; KernelEnvironmentPtr = &KernelEnvironment; - KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment; + KernelLaunchEnvironmentPtr = KLE; } } @@ -430,6 +463,17 @@ int omp_get_team_num() { return mapping::getBlockIdInKernel(); } int omp_get_initial_device(void) { return -1; } int omp_is_initial_device(void) { return 0; } + +void *omp_get_dyn_groupprivate_ptr(size_t Offset, int *IsFallback, + omp_access_t) { + if (IsFallback != NULL) + *IsFallback = DynCGroupMem.isFallback(); + return DynCGroupMem.getPtr(Offset); +} + +size_t omp_get_dyn_groupprivate_size(omp_access_t) { + return DynCGroupMem.getSize(); +} } extern "C" { diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index 978b53d5d69b9..0ef2dd162292b 100644 --- a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h 
@@ -97,8 +97,10 @@ struct KernelArgsTy { struct { uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause. uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA. - uint64_t Unused : 62; - } Flags = {0, 0, 0}; + uint64_t AllowDynCGroupMemFallback : 1; // Allow fallback for dynamic cgroup + // mem fallback. + uint64_t Unused : 61; + } Flags = {0, 0, 0, 0}; // The number of teams (for x,y,z dimension). uint32_t NumTeams[3] = {0, 0, 0}; // The number of threads (for x,y,z dimension). diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h index 2a283bd6fa4ed..0670ac1090da4 100644 --- a/offload/include/Shared/Environment.h +++ b/offload/include/Shared/Environment.h @@ -93,9 +93,11 @@ struct KernelEnvironmentTy { }; struct KernelLaunchEnvironmentTy { + void *ReductionBuffer = nullptr; + void *DynCGroupMemFallback = nullptr; uint32_t ReductionCnt = 0; uint32_t ReductionIterCnt = 0; - void *ReductionBuffer = nullptr; + uint32_t DynCGroupMemSize = 0; }; #endif // OMPTARGET_SHARED_ENVIRONMENT_H diff --git a/offload/include/device.h b/offload/include/device.h index f4b10abbaa3fd..0e93cf8ec1a8b 100644 --- a/offload/include/device.h +++ b/offload/include/device.h @@ -158,6 +158,9 @@ struct DeviceTy { /// Indicate that there are pending images for this device or not. void setHasPendingImages(bool V) { HasPendingImages = V; } + /// Get the maximum shared memory per team for any kernel. + uint64_t getMaxSharedTeamMemory(); + private: /// Deinitialize the device (and plugin). void deinit(); diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 6971780c7bdb5..45bb74ec367d6 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t { inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - 0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0}; + 0, {0,0,0,0}, {1, 0, 0}, {1, 0, 0}, 0}; struct DeviceTy; @@ -273,10 +273,15 @@ struct __tgt_target_non_contig { extern "C" { #endif +typedef enum { + omp_access_cgroup = 0, +} omp_access_t; + void ompx_dump_mapping_tables(void); int omp_get_num_devices(void); int omp_get_device_num(void); int omp_get_initial_device(void); +size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup); void *omp_target_alloc(size_t Size, int DeviceNum); void omp_target_free(void *DevicePtr, int DeviceNum); int omp_target_is_present(const void *Ptr, int DeviceNum); diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp index 4576f9bd06121..1ed4192157fc8 100644 --- a/offload/libomptarget/OpenMP/API.cpp +++ b/offload/libomptarget/OpenMP/API.cpp @@ -98,6 +98,20 @@ EXTERN int omp_get_initial_device(void) { return HostDevice; } +EXTERN size_t omp_get_groupprivate_limit(int DeviceNum, + omp_access_t AccessGroup) { + TIMESCOPE(); + OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); + if (DeviceNum == omp_get_initial_device()) + return 0; + + auto DeviceOrErr = PM->getDevice(DeviceNum); + if (!DeviceOrErr) + FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str()); + + return DeviceOrErr->getMaxSharedTeamMemory(); +} + EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) { TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) + ";size=" + std::to_string(Size)); diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp index f88e30ae9e76b..31bfc7d092424 100644 --- 
a/offload/libomptarget/device.cpp +++ b/offload/libomptarget/device.cpp @@ -281,3 +281,9 @@ bool DeviceTy::useAutoZeroCopy() { return false; return RTL->use_auto_zero_copy(RTLDeviceID); } + +uint64_t DeviceTy::getMaxSharedTeamMemory() { + using DeviceQueryKind = llvm::omp::target::plugin::DeviceQueryKind; + return RTL->query_device_info( + RTLDeviceID, DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM); +} diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports index 2406776c1fb5f..b5a1401564d58 100644 --- a/offload/libomptarget/exports +++ b/offload/libomptarget/exports @@ -40,6 +40,7 @@ VERS1.0 { omp_get_num_devices; omp_get_device_num; omp_get_initial_device; + omp_get_groupprivate_limit; omp_target_alloc; omp_target_free; omp_target_is_present; diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h index 3117763e35896..2cf156e576c5f 100644 --- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h +++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h @@ -52,6 +52,7 @@ typedef enum { HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15, + HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16, } hsa_amd_memory_pool_info_t; typedef enum { diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 12c7cc62905c9..fa373c2029f0c 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -273,7 +273,6 @@ struct AMDGPUMemoryPoolTy { if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags)) return Err; - return Plugin::success(); } @@ -543,6 +542,8 @@ struct AMDGPUKernelTy : public GenericKernelTy { return Err; } + StaticBlockMemSize = GroupSize; + // Make sure it is a kernel symbol. if (SymbolType != HSA_SYMBOL_KIND_KERNEL) return Plugin::error(ErrorCode::INVALID_BINARY, @@ -566,8 +567,8 @@ struct AMDGPUKernelTy : public GenericKernelTy { /// Launch the AMDGPU kernel function. Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], - uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, - KernelLaunchParamsTy LaunchParams, + uint32_t NumBlocks[3], uint32_t DynBlockMemSize, + KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; /// Print more elaborate kernel launch info for AMDGPU @@ -2020,6 +2021,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = checkIfAPU()) return Err; + // Retrieve the size of the group memory. + for (const auto *Pool : AllMemoryPools) { + if (Pool->isGroup()) { + size_t Size = 0; + if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, Size)) + return Err; + MaxBlockSharedMemSize = Size; + break; + } + } + + // Supports block shared memory natively. 
+ HasNativeBlockSharedMem = true; + return Plugin::success(); } @@ -2856,7 +2871,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { KernelArgsTy KernelArgs = {}; uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u}; if (auto Err = AMDGPUKernel.launchImpl( - *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs, + *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs, KernelLaunchParamsTy{}, AsyncInfoWrapper)) return Err; @@ -3357,6 +3372,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy { Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], uint32_t NumBlocks[3], + uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const { @@ -3374,13 +3390,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs)) return Err; - // Account for user requested dynamic shared memory. - uint32_t GroupSize = getGroupSize(); - if (uint32_t MaxDynCGroupMem = std::max( - KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) { - GroupSize += MaxDynCGroupMem; - } - uint64_t StackSize; if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) return Err; @@ -3434,7 +3443,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, // Push the kernel launch into the stream. return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks, - GroupSize, StackSize, ArgsMemoryManager); + getStaticBlockMemSize() + DynBlockMemSize, + StackSize, ArgsMemoryManager); } Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 162b149ab483e..3357ccfe0c9b5 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -226,6 +226,10 @@ struct InfoTreeNode { } }; +enum class DeviceQueryKind { + DEVICE_QUERY_MAX_SHARED_TEAM_MEM = 0, +}; + /// Class wrapping a __tgt_device_image and its offload entry table on a /// specific device. This class is responsible for storing and managing /// the offload entries for an image on a device. @@ -312,13 +316,16 @@ struct GenericKernelTy { AsyncInfoWrapperTy &AsyncInfoWrapper) const; virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], uint32_t NumBlocks[3], - KernelArgsTy &KernelArgs, + uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; /// Get the kernel name. const char *getName() const { return Name.c_str(); } + /// Get the size of the static per-block memory consumed by the kernel. + uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; }; + /// Get the kernel image. DeviceImageTy &getImage() const { assert(ImagePtr && "Kernel is not initialized!"); @@ -331,9 +338,9 @@ struct GenericKernelTy { } /// Return a device pointer to a new kernel launch environment. - Expected<KernelLaunchEnvironmentTy *> - getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version, - AsyncInfoWrapperTy &AsyncInfo) const; + Expected<KernelLaunchEnvironmentTy *> getKernelLaunchEnvironment( + GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs, + void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfo) const; /// Indicate whether an execution mode is valid. 
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) { @@ -425,6 +432,9 @@ struct GenericKernelTy { /// The maximum number of threads which the kernel could leverage. uint32_t MaxNumThreads; + /// The static memory sized per block. + uint32_t StaticBlockMemSize = 0; + /// The kernel environment, including execution flags. KernelEnvironmentTy KernelEnvironment; @@ -731,6 +741,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// this id is not unique between different plugins; they may overlap. int32_t getDeviceId() const { return DeviceId; } + /// Get the total shared memory per block that can be used in any kernel. + uint32_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; } + + /// Indicate whether the device has native block shared memory. + bool hasNativeBlockSharedMem() const { return HasNativeBlockSharedMem; } + /// Set the context of the device if needed, before calling device-specific /// functions. Plugins may implement this function as a no-op if not needed. virtual Error setContext() = 0; @@ -1132,6 +1148,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy { std::atomic<bool> OmptInitialized; #endif + /// The total per-block shared memory that a kernel may use. + uint32_t MaxBlockSharedMemSize = 0; + + /// Whether the device has native block shared memory. + bool HasNativeBlockSharedMem = false; + private: DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0}; DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0}; @@ -1347,6 +1369,9 @@ struct GenericPluginTy { /// Prints information about the given devices supported by the plugin. void print_device_info(int32_t DeviceId); + /// Retrieve information about the given device. + int64_t query_device_info(int32_t DeviceId, DeviceQueryKind Query); + /// Creates an event in the given plugin if supported. int32_t create_event(int32_t DeviceId, void **EventPtr); diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 81b9d423e13d8..2997585e1660f 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -477,20 +477,20 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice, Expected<KernelLaunchEnvironmentTy *> GenericKernelTy::getKernelLaunchEnvironment( - GenericDeviceTy &GenericDevice, uint32_t Version, - AsyncInfoWrapperTy &AsyncInfoWrapper) const { + GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs, + void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfoWrapper) const { // Ctor/Dtor have no arguments, replaying uses the original kernel launch // environment. Older versions of the compiler do not generate a kernel // launch environment. if (GenericDevice.Plugin.getRecordReplay().isReplaying() || - Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR) + KernelArgs.Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR) return nullptr; - if (!KernelEnvironment.Configuration.ReductionDataSize || - !KernelEnvironment.Configuration.ReductionBufferLength) + if ((!KernelEnvironment.Configuration.ReductionDataSize || + !KernelEnvironment.Configuration.ReductionBufferLength) && + KernelArgs.DynCGroupMem == 0) return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0); - // TODO: Check if the kernel needs a launch environment. 
auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy), /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE); @@ -504,7 +504,9 @@ GenericKernelTy::getKernelLaunchEnvironment( /// async data transfer. auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment; LocalKLE = KernelLaunchEnvironment; - { + + if (KernelEnvironment.Configuration.ReductionDataSize && + KernelEnvironment.Configuration.ReductionBufferLength) { auto AllocOrErr = GenericDevice.dataAlloc( KernelEnvironment.Configuration.ReductionDataSize * KernelEnvironment.Configuration.ReductionBufferLength, @@ -514,8 +516,13 @@ GenericKernelTy::getKernelLaunchEnvironment( LocalKLE.ReductionBuffer = *AllocOrErr; // Remember to free the memory later. AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr); + } else { + LocalKLE.ReductionBuffer = nullptr; } + LocalKLE.DynCGroupMemSize = KernelArgs.DynCGroupMem; + LocalKLE.DynCGroupMemFallback = FallbackBlockMem; + INFO(OMP_INFOTYPE_DATA_TRANSFER, GenericDevice.getDeviceId(), "Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD ", Size=%" PRId64 ", Name=KernelLaunchEnv\n", @@ -556,8 +563,45 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, llvm::SmallVector<void *, 16> Args; llvm::SmallVector<void *, 16> Ptrs; + uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0], + KernelArgs.ThreadLimit[1], + KernelArgs.ThreadLimit[2]}; + uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1], + KernelArgs.NumTeams[2]}; + if (!isBareMode()) { + NumThreads[0] = getNumThreads(GenericDevice, NumThreads); + NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount, + NumThreads[0], KernelArgs.ThreadLimit[0] > 0); + } + + uint32_t MaxBlockMemSize = GenericDevice.getMaxBlockSharedMemSize(); + uint32_t DynBlockMemSize = KernelArgs.DynCGroupMem; + uint32_t TotalBlockMemSize = StaticBlockMemSize + DynBlockMemSize; + if (StaticBlockMemSize > MaxBlockMemSize) + return Plugin::error(ErrorCode::INVALID_ARGUMENT, + "Static block memory size exceeds maximum"); + else if (!KernelArgs.Flags.AllowDynCGroupMemFallback && + TotalBlockMemSize > MaxBlockMemSize) + return Plugin::error( + ErrorCode::INVALID_ARGUMENT, + "Static and dynamic block memory size exceeds maximum"); + + void *FallbackBlockMem = nullptr; + if (DynBlockMemSize && (!GenericDevice.hasNativeBlockSharedMem() || + TotalBlockMemSize > MaxBlockMemSize)) { + auto AllocOrErr = GenericDevice.dataAlloc( + NumBlocks[0] * DynBlockMemSize, + /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE); + if (!AllocOrErr) + return AllocOrErr.takeError(); + + FallbackBlockMem = *AllocOrErr; + AsyncInfoWrapper.freeAllocationAfterSynchronization(FallbackBlockMem); + DynBlockMemSize = 0; + } + auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment( - GenericDevice, KernelArgs.Version, AsyncInfoWrapper); + GenericDevice, KernelArgs, FallbackBlockMem, AsyncInfoWrapper); if (!KernelLaunchEnvOrErr) return KernelLaunchEnvOrErr.takeError(); @@ -573,17 +617,6 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, Args, Ptrs, *KernelLaunchEnvOrErr); } - uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0], - KernelArgs.ThreadLimit[1], - KernelArgs.ThreadLimit[2]}; - uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1], - KernelArgs.NumTeams[2]}; - if (!isBareMode()) { - NumThreads[0] = getNumThreads(GenericDevice, NumThreads); - NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount, - NumThreads[0], 
KernelArgs.ThreadLimit[0] > 0); - } - // Record the kernel description after we modified the argument count and num // blocks/threads. RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay(); @@ -599,8 +632,8 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks)) return Err; - return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs, - LaunchParams, AsyncInfoWrapper); + return launchImpl(GenericDevice, NumThreads, NumBlocks, DynBlockMemSize, + KernelArgs, LaunchParams, AsyncInfoWrapper); } KernelLaunchParamsTy GenericKernelTy::prepareArgs( @@ -2077,6 +2110,17 @@ void GenericPluginTy::print_device_info(int32_t DeviceId) { toString(std::move(Err)).data()); } +int64_t GenericPluginTy::query_device_info(int32_t DeviceId, + DeviceQueryKind Query) { + const GenericDeviceTy &Device = getDevice(DeviceId); + + switch (Query) { + case DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM: + return Device.getMaxBlockSharedMemSize(); + } + return 0; +} + int32_t GenericPluginTy::create_event(int32_t DeviceId, void **EventPtr) { auto Err = getDevice(DeviceId).createEvent(EventPtr); if (Err) { diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h index b6c022c8e7e8b..b6e087edea876 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -258,6 +258,7 @@ typedef enum CUdevice_attribute_enum { typedef enum CUfunction_attribute_enum { CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, } CUfunction_attribute; diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 15193de6ae430..eda7a85f750f0 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -148,13 +148,21 @@ struct CUDAKernelTy : public GenericKernelTy { // The maximum number of threads cannot exceed the maximum of the kernel. MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads); + int SharedMemSize; + Res = cuFuncGetAttribute(&SharedMemSize, + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, Func); + if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s")) + return Err; + + StaticBlockMemSize = SharedMemSize; + return Plugin::success(); } /// Launch the CUDA kernel function. Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], - uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, - KernelLaunchParamsTy LaunchParams, + uint32_t NumBlocks[3], uint32_t DynBlockMemSize, + KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; private: @@ -162,7 +170,7 @@ struct CUDAKernelTy : public GenericKernelTy { CUfunction Func; /// The maximum amount of dynamic shared memory per thread group. By default, /// this is set to 48 KB. - mutable uint32_t MaxDynCGroupMemLimit = 49152; + mutable uint32_t MaxDynBlockMemSize = 49152; }; /// Class wrapping a CUDA stream reference. 
These are the objects handled by the @@ -358,6 +366,15 @@ struct CUDADeviceTy : public GenericDeviceTy { return Err; HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize); + uint32_t MaxSharedMem; + if (auto Err = getDeviceAttr( + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, MaxSharedMem)) + return Err; + MaxBlockSharedMemSize = MaxSharedMem; + + // Supports block shared memory natively. + HasNativeBlockSharedMem = true; + return Plugin::success(); } @@ -1239,7 +1256,7 @@ struct CUDADeviceTy : public GenericDeviceTy { KernelArgsTy KernelArgs = {}; uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u}; if (auto Err = CUDAKernel.launchImpl( - *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs, + *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs, KernelLaunchParamsTy{}, AsyncInfoWrapper)) return Err; @@ -1285,6 +1302,7 @@ struct CUDADeviceTy : public GenericDeviceTy { Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], uint32_t NumBlocks[3], + uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const { @@ -1294,9 +1312,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream)) return Err; - uint32_t MaxDynCGroupMem = - std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize()); - void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data, CU_LAUNCH_PARAM_BUFFER_SIZE, reinterpret_cast<void *>(&LaunchParams.Size), @@ -1308,18 +1323,18 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, GenericDevice.Plugin.getRPCServer().Thread->notify(); // In case we require more memory than the current limit. - if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) { + if (DynBlockMemSize >= MaxDynBlockMemSize) { CUresult AttrResult = cuFuncSetAttribute( - Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem); + Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, DynBlockMemSize); Plugin::check( AttrResult, "Error in cuLaunchKernel while setting the memory limits: %s"); - MaxDynCGroupMemLimit = MaxDynCGroupMem; + MaxDynBlockMemSize = DynBlockMemSize; } CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2], NumThreads[0], NumThreads[1], NumThreads[2], - MaxDynCGroupMem, Stream, nullptr, Config); + DynBlockMemSize, Stream, nullptr, Config); // Register a callback to indicate when the kernel is complete. if (GenericDevice.getRPCServer()) diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index d950572265b4c..dc82a2ef16e51 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -92,8 +92,8 @@ struct GenELF64KernelTy : public GenericKernelTy { /// Launch the kernel using the libffi. Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], - uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, - KernelLaunchParamsTy LaunchParams, + uint32_t NumBlocks[3], uint32_t DynBlockMemSize, + KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override { // Create a vector of ffi_types, one per argument. 
SmallVector<ffi_type *, 16> ArgTypes(KernelArgs.NumArgs, &ffi_type_pointer); diff --git a/offload/test/offloading/dyn_groupprivate_strict.cpp b/offload/test/offloading/dyn_groupprivate_strict.cpp new file mode 100644 index 0000000000000..a35f8dd2b0595 --- /dev/null +++ b/offload/test/offloading/dyn_groupprivate_strict.cpp @@ -0,0 +1,141 @@ +// RUN: %libomptarget-compilexx-run-and-check-generic +// REQUIRES: gpu + +#include <omp.h> +#include <stdio.h> + +#define N 512 + +int main() { + int Result[N], NumThreads; + +#pragma omp target teams num_teams(1) thread_limit(N) \ + dyn_groupprivate(strict : N * sizeof(Result[0])) \ + map(from : Result, NumThreads) + { + int Buffer[N]; +#pragma omp parallel + { + int *DynBuffer = (int *)omp_get_dyn_groupprivate_ptr(); + int TId = omp_get_thread_num(); + if (TId == 0) + NumThreads = omp_get_num_threads(); + Buffer[TId] = 7; + DynBuffer[TId] = 3; +#pragma omp barrier + int WrappedTId = (TId + 37) % NumThreads; + Result[TId] = Buffer[WrappedTId] + DynBuffer[WrappedTId]; + } + } + + if (NumThreads < N / 2 || NumThreads > N) { + printf("Expected number of threads to be in [%i:%i], but got: %i", N / 2, N, + NumThreads); + return -1; + } + + int Failed = 0; + for (int i = 0; i < NumThreads; ++i) { + if (Result[i] != 7 + 3) { + printf("Result[%i] is %i, expected %i\n", i, Result[i], 7 + 3); + ++Failed; + } + } + + // Verify that the routines in the host returns NULL and zero. + if (omp_get_dyn_groupprivate_ptr()) + ++Failed; + if (omp_get_dyn_groupprivate_size()) + ++Failed; + + size_t MaxSize = omp_get_groupprivate_limit(0, omp_access_cgroup); + size_t ExceededSize = MaxSize + 10; + +// Verify that the fallback modifier works. +#pragma omp target dyn_groupprivate(fallback : ExceededSize) \ + map(tofrom : Failed) + { + int IsFallback; + if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback)) + ++Failed; + if (!omp_get_dyn_groupprivate_size()) + ++Failed; + if (omp_get_dyn_groupprivate_size() != ExceededSize) + ++Failed; + if (!IsFallback) + ++Failed; + } + +// Verify that the default modifier is fallback. +#pragma omp target dyn_groupprivate(ExceededSize) + { + } + +// Verify that the strict modifier works. +#pragma omp target dyn_groupprivate(strict : N) map(tofrom : Failed) + { + int IsFallback; + if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback)) + ++Failed; + if (!omp_get_dyn_groupprivate_size()) + ++Failed; + if (omp_get_dyn_groupprivate_size() != N) + ++Failed; + if (IsFallback) + ++Failed; + } + +// Verify that the fallback does not trigger when not needed. +#pragma omp target dyn_groupprivate(fallback : N) map(tofrom : Failed) + { + int IsFallback; + if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback)) + ++Failed; + if (!omp_get_dyn_groupprivate_size()) + ++Failed; + if (omp_get_dyn_groupprivate_size() != N) + ++Failed; + if (IsFallback) + ++Failed; + } + +// Verify that the clause works when passing a zero size. +#pragma omp target dyn_groupprivate(strict : 0) map(tofrom : Failed) + { + int IsFallback; + if (omp_get_dyn_groupprivate_ptr(0, &IsFallback)) + ++Failed; + if (omp_get_dyn_groupprivate_size()) + ++Failed; + if (IsFallback) + ++Failed; + } + +// Verify that the clause works when passing a zero size. +#pragma omp target dyn_groupprivate(fallback : 0) map(tofrom : Failed) + { + int IsFallback; + if (omp_get_dyn_groupprivate_ptr(0, &IsFallback)) + ++Failed; + if (omp_get_dyn_groupprivate_size()) + ++Failed; + if (IsFallback) + ++Failed; + } + +// Verify that omitting the clause is the same as setting zero size. 
+#pragma omp target map(tofrom : Failed) + { + int IsFallback; + if (omp_get_dyn_groupprivate_ptr(0, &IsFallback)) + ++Failed; + if (omp_get_dyn_groupprivate_size()) + ++Failed; + if (IsFallback) + ++Failed; + } + + // CHECK: PASS + if (!Failed) + printf("PASS\n"); +} diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var index 74f385feb3ea5..26c3df56a9ce3 100644 --- a/openmp/runtime/src/include/omp.h.var +++ b/openmp/runtime/src/include/omp.h.var @@ -380,6 +380,10 @@ omp_uintptr_t value; } omp_alloctrait_t; + typedef enum { + omp_access_cgroup = 0, + } omp_access_t; + # if defined(_WIN32) // On Windows cl and icl do not support 64-bit enum, let's use integer then. typedef omp_uintptr_t omp_allocator_handle_t; @@ -463,6 +467,9 @@ omp_allocator_handle_t allocator = omp_null_allocator, omp_allocator_handle_t free_allocator = omp_null_allocator); extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator); + extern void *__KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_ptr(size_t offset = 0, int *is_fallback = NULL, omp_access_t access_group = omp_access_cgroup); + extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_size(omp_access_t access_group = omp_access_cgroup); + extern size_t __KAI_KMPC_CONVENTION omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup); # else extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a); extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size, @@ -473,6 +480,9 @@ extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator, omp_allocator_handle_t free_allocator); extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a); + extern void *__KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback, omp_access_t access_group); + extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_size(omp_access_t access_group); + extern size_t __KAI_KMPC_CONVENTION omp_get_groupprivate_limit(int device_num, omp_access_t access_group); # endif /* OpenMP TR11 routines to get memory spaces and allocators */ diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp index 3ca32ba583fe2..9605bad457e11 100644 --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -4515,6 +4515,15 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) { } /* end of OpenMP 5.1 Memory Management routines */ +void *omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback, + omp_access_t access_group) { + if (is_fallback != NULL) + *is_fallback = 0; + return NULL; +} + +size_t omp_get_dyn_groupprivate_size(omp_access_t access_group) { return 0; } + int __kmpc_get_target_offload(void) { if (!__kmp_init_serial) { __kmp_serial_initialize(); diff --git a/openmp/runtime/src/kmp_stub.cpp b/openmp/runtime/src/kmp_stub.cpp index 06276d1bed1c7..a099f887b6ba4 100644 --- a/openmp/runtime/src/kmp_stub.cpp +++ b/openmp/runtime/src/kmp_stub.cpp @@ -454,6 +454,22 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) { #endif } +void *omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback, + omp_access_t access_group) { + i; + return NULL; +} + +size_t omp_get_dyn_groupprivate_size(omp_access_t access_group) { + i; + return 0; +} + +size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group) { + i; + return 0; +} + /* OpenMP 5.0 Affinity Format */ void 
omp_set_affinity_format(char const *format) { i; } size_t omp_get_affinity_format(char *buffer, size_t size) { >From 20a96c655deafe863233c1f5e18602b4239eb229 Mon Sep 17 00:00:00 2001 From: Kevin Sala <salapenad...@llnl.gov> Date: Sat, 9 Aug 2025 22:50:23 -0700 Subject: [PATCH 2/2] Add fixes --- offload/DeviceRTL/include/DeviceTypes.h | 5 +++++ offload/DeviceRTL/src/State.cpp | 19 ++++++++++--------- offload/include/omptarget.h | 15 +++++++++++---- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 3 +++ offload/plugins-nextgen/cuda/src/rtl.cpp | 3 +++ openmp/runtime/src/kmp_csupport.cpp | 2 +- 6 files changed, 33 insertions(+), 14 deletions(-) diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h index a43b506d6879e..042fef45917b0 100644 --- a/offload/DeviceRTL/include/DeviceTypes.h +++ b/offload/DeviceRTL/include/DeviceTypes.h @@ -163,8 +163,13 @@ typedef enum omp_allocator_handle_t { ///} +/// The OpenMP access group type. The criterion for grouping tasks using a +/// specific grouping property. enum omp_access_t { + /// Groups the tasks based on the contention group to which they belong. omp_access_cgroup = 0, + /// Groups the tasks based on the parallel region to which they bind. + omp_access_pteam = 1, }; #endif diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp index 9e2a9999167b4..c6bc6a140f5f2 100644 --- a/offload/DeviceRTL/src/State.cpp +++ b/offload/DeviceRTL/src/State.cpp @@ -163,14 +163,15 @@ struct DynCGroupMemTy { Size = 0; Ptr = nullptr; IsFallback = false; - if (KLE) { - Size = KLE->DynCGroupMemSize; - if (void *Fallback = KLE->DynCGroupMemFallback) { - Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num(); - IsFallback = true; - } else { - Ptr = static_cast<char *>(NativeDynCGroup); - } + if (!KLE) + return; + + Size = KLE->DynCGroupMemSize; + if (void *Fallback = KLE->DynCGroupMemFallback) { + Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num(); + IsFallback = true; + } else { + Ptr = static_cast<char *>(NativeDynCGroup); } } @@ -466,7 +467,7 @@ int omp_is_initial_device(void) { return 0; } void *omp_get_dyn_groupprivate_ptr(size_t Offset, int *IsFallback, omp_access_t) { - if (IsFallback != NULL) + if (IsFallback != nullptr) *IsFallback = DynCGroupMem.isFallback(); return DynCGroupMem.getPtr(Offset); } diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 45bb74ec367d6..ddb0f7f88d2e0 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -273,15 +273,22 @@ struct __tgt_target_non_contig { extern "C" { #endif -typedef enum { - omp_access_cgroup = 0, -} omp_access_t; +/// The OpenMP access group type. The criterion for grouping tasks using a +/// specific grouping property. enum omp_access_t { /// Groups the tasks based on the contention group to which they belong. omp_access_cgroup = 0, /// Groups the tasks based on the parallel region to which they bind.
+ omp_access_pteam = 1, }; void ompx_dump_mapping_tables(void); int omp_get_num_devices(void); int omp_get_device_num(void); int omp_get_initial_device(void); -size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup); +size_t +omp_get_groupprivate_limit(int device_num, + omp_access_t access_group = omp_access_cgroup); void *omp_target_alloc(size_t Size, int DeviceNum); void omp_target_free(void *DevicePtr, int DeviceNum); int omp_target_is_present(const void *Ptr, int DeviceNum); diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index fa373c2029f0c..48e677d060e68 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3441,6 +3441,9 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, KernelArgs.DynCGroupMem); } + // Increase to the requested dynamic memory size for the device if needed. + DynBlockMemSize = std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize()); + // Push the kernel launch into the stream. return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks, getStaticBlockMemSize() + DynBlockMemSize, diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index eda7a85f750f0..bd1cedf5632fc 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -1322,6 +1322,9 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (GenericDevice.getRPCServer()) GenericDevice.Plugin.getRPCServer().Thread->notify(); + // Increase to the requested dynamic memory size for the device if needed. + DynBlockMemSize = std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize()); + // In case we require more memory than the current limit. if (DynBlockMemSize >= MaxDynBlockMemSize) { CUresult AttrResult = cuFuncSetAttribute( diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp index 9605bad457e11..3ac62e5893f8b 100644 --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -4517,7 +4517,7 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) { void *omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback, omp_access_t access_group) { - if (is_fallback != NULL) + if (is_fallback != nullptr) *is_fallback = 0; return NULL; }
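For reference, the launch-time policy introduced in PluginInterface.cpp above (serve the request from native per-block shared memory, fall back to a per-team global-memory buffer, or reject the launch), condensed into a small standalone sketch. The function and enum names here are illustrative stand-ins; only the decision logic mirrors the patch, and error handling, allocation, and the kernel-launch plumbing are left out.

#include <cstdint>
#include <cstdio>

enum class DynMemPlan { Native, Fallback, Error };

// Mirrors the checks added in GenericKernelTy::launch: a kernel whose static
// block memory already exceeds the device limit cannot run; a 'strict' request
// that does not fit fails; otherwise the request is served natively unless the
// device lacks native block shared memory or the total does not fit, in which
// case a per-team buffer in global memory is used instead.
static DynMemPlan planDynCGroupMem(uint32_t MaxBlockMemSize,
                                   bool HasNativeBlockSharedMem,
                                   uint32_t StaticBlockMemSize,
                                   uint32_t DynCGroupMem, bool AllowFallback) {
  uint32_t Total = StaticBlockMemSize + DynCGroupMem;
  if (StaticBlockMemSize > MaxBlockMemSize)
    return DynMemPlan::Error;
  if (!AllowFallback && Total > MaxBlockMemSize)
    return DynMemPlan::Error;
  if (DynCGroupMem && (!HasNativeBlockSharedMem || Total > MaxBlockMemSize))
    return DynMemPlan::Fallback;
  return DynMemPlan::Native;
}

int main() {
  // Example: 64 KiB device limit, 1 KiB static usage, 96 KiB requested with 'fallback'.
  DynMemPlan P = planDynCGroupMem(/*MaxBlockMemSize=*/64 * 1024,
                                  /*HasNativeBlockSharedMem=*/true,
                                  /*StaticBlockMemSize=*/1024,
                                  /*DynCGroupMem=*/96 * 1024,
                                  /*AllowFallback=*/true);
  printf("%s\n", P == DynMemPlan::Native     ? "native"
                 : P == DynMemPlan::Fallback ? "fallback"
                                             : "error");
  return 0;
}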