https://github.com/jdoerfert updated https://github.com/llvm/llvm-project/pull/104168
>From bda519493667d3554365061475308da3786bfa9d Mon Sep 17 00:00:00 2001 From: Johannes Doerfert <johan...@jdoerfert.de> Date: Mon, 12 Aug 2024 11:53:06 -0700 Subject: [PATCH] [Offload] Provide a kernel library useable by the offload runtime As mentioned in #68706, it is useful to be able to call kernels from the runtime, e.g., to perform memset. This patch provides a kernel library that can be invoked from the offload runtime directly. --- clang/lib/Driver/ToolChains/CommonArgs.cpp | 7 +- offload/DeviceRTL/CMakeLists.txt | 1 + offload/include/device.h | 3 + offload/include/omptarget.h | 5 + offload/plugins-nextgen/amdgpu/src/rtl.cpp | 56 ++------- .../common/include/PluginInterface.h | 33 ++++-- .../common/src/PluginInterface.cpp | 107 ++++++++++++++++-- offload/plugins-nextgen/cuda/src/rtl.cpp | 33 +++--- offload/plugins-nextgen/host/src/rtl.cpp | 5 +- offload/src/CMakeLists.txt | 18 +++ offload/src/Kernels/Memory.cpp | 53 +++++++++ offload/src/OpenMP/API.cpp | 65 +++++++---- offload/src/device.cpp | 11 ++ offload/src/exports | 1 + offload/src/interface.cpp | 15 +++ offload/test/jit/type_punning.c | 4 +- offload/test/lit.cfg | 5 +- offload/test/offloading/kernels_memset.c | 61 ++++++++++ 18 files changed, 368 insertions(+), 115 deletions(-) create mode 100644 offload/src/Kernels/Memory.cpp create mode 100644 offload/test/offloading/kernels_memset.c diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 1cba3e1220264a..4080356c636dc2 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1202,8 +1202,11 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs, options::OPT_fno_openmp, false)) { // We need libomptarget (liboffload) if it's the choosen offloading runtime. 
if (Args.hasFlag(options::OPT_foffload_via_llvm, - options::OPT_fno_offload_via_llvm, false)) + options::OPT_fno_offload_via_llvm, false)) { CmdArgs.push_back("-lomptarget"); + if (!Args.hasArg(options::OPT_nogpulib)) + CmdArgs.append({"-lomptarget.devicertl", "-loffload.kernels"}); + } return false; } @@ -1240,7 +1243,7 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs, CmdArgs.push_back("-lomptarget"); if (IsOffloadingHost && !Args.hasArg(options::OPT_nogpulib)) - CmdArgs.push_back("-lomptarget.devicertl"); + CmdArgs.append({"-lomptarget.devicertl", "-loffload.kernels"}); addArchSpecificRPath(TC, Args, CmdArgs); diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt index 7818c8d752599c..e321047f781a3e 100644 --- a/offload/DeviceRTL/CMakeLists.txt +++ b/offload/DeviceRTL/CMakeLists.txt @@ -69,6 +69,7 @@ elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR "${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}") endif() list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES) +set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${LIBOMPTARGET_DEVICE_ARCHITECTURES} PARENT_SCOPE) set(include_files ${include_directory}/Allocator.h diff --git a/offload/include/device.h b/offload/include/device.h index 3132d35b7b38c8..d3415785708d62 100644 --- a/offload/include/device.h +++ b/offload/include/device.h @@ -124,6 +124,9 @@ struct DeviceTy { /// Calls the corresponding print device info function in the plugin. bool printDeviceInfo(); + /// Return the handle to the kernel with name \p Name in \p HandlePtr. + int32_t getKernelHandle(llvm::StringRef Name, void **HandlePtr); + /// Event related interfaces. /// { /// Create an event. 
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 2b6445e9fbe550..f4ff5d33f7bf0f 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -400,6 +400,11 @@ void __tgt_target_data_update_nowait_mapper( int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, int32_t ThreadLimit, void *HostPtr, KernelArgsTy *Args); +/// Launch the kernel \p KernelName with a CUDA style launch and the given grid +/// sizes and arguments (\p KernelArgs). +int __tgt_launch_by_name(ident_t *Loc, int64_t DeviceId, const char *KernelName, + KernelArgsTy *KernelArgs); + // Non-blocking synchronization for target nowait regions. This function // acquires the asynchronous context from task data of the current task being // executed and tries to query for the completion of its operations. If the diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 604683370cd27d..5d135795170563 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2016,20 +2016,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Plugin::success(); } - virtual Error callGlobalConstructors(GenericPluginTy &Plugin, - DeviceImageTy &Image) override { - GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); - if (Handler.isSymbolInImage(*this, Image, "amdgcn.device.fini")) - Image.setPendingGlobalDtors(); - - return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true); + virtual Expected<StringRef> + getGlobalConstructorName(DeviceImageTy &Image) override { + return "amdgcn.device.init"; } - - virtual Error callGlobalDestructors(GenericPluginTy &Plugin, - DeviceImageTy &Image) override { - if (Image.hasPendingGlobalDtors()) - return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); - return Plugin::success(); + virtual Expected<StringRef> + getGlobalDestructorName(DeviceImageTy &Image) override { + return 
"amdgcn.device.fini"; } uint64_t getStreamBusyWaitMicroseconds() const { return OMPX_StreamBusyWait; } @@ -2107,13 +2100,14 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { uint64_t getClockFrequency() const override { return ClockFrequency; } /// Allocate and construct an AMDGPU kernel. - Expected<GenericKernelTy &> constructKernel(const char *Name) override { + Expected<GenericKernelTy &> + constructKernelImpl(llvm::StringRef Name) override { // Allocate and construct the AMDGPU kernel. AMDGPUKernelTy *AMDGPUKernel = Plugin.allocate<AMDGPUKernelTy>(); if (!AMDGPUKernel) return Plugin::error("Failed to allocate memory for AMDGPU kernel"); - new (AMDGPUKernel) AMDGPUKernelTy(Name); + new (AMDGPUKernel) AMDGPUKernelTy(Name.data()); return *AMDGPUKernel; } @@ -2791,38 +2785,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>; using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>; - /// Common method to invoke a single threaded constructor or destructor - /// kernel by name. - Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image, - bool IsCtor) { - const char *KernelName = - IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini"; - // Perform a quick check for the named kernel in the image. The kernel - // should be created by the 'amdgpu-lower-ctor-dtor' pass. - GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); - if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName)) - return Plugin::success(); - - // Allocate and construct the AMDGPU kernel. 
- AMDGPUKernelTy AMDGPUKernel(KernelName); - if (auto Err = AMDGPUKernel.init(*this, Image)) - return Err; - - AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); - - KernelArgsTy KernelArgs = {}; - if (auto Err = - AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u, - /*NumBlocks=*/1ul, KernelArgs, - KernelLaunchParamsTy{}, AsyncInfoWrapper)) - return Err; - - Error Err = Plugin::success(); - AsyncInfoWrapper.finalize(Err); - - return Err; - } - /// Detect if current architecture is an APU. Error checkIfAPU() { // TODO: replace with ROCr API once it becomes available. diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 81823338fe2112..08bdb0d75a464f 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -722,18 +722,17 @@ struct GenericDeviceTy : public DeviceAllocatorTy { Error synchronize(__tgt_async_info *AsyncInfo); virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0; - /// Invokes any global constructors on the device if present and is required - /// by the target. - virtual Error callGlobalConstructors(GenericPluginTy &Plugin, - DeviceImageTy &Image) { - return Error::success(); + /// Call the ctor/dtor of image \p Image, if available. + Error callGlobalCtorDtor(DeviceImageTy &Image, bool IsCtor); + + /// Return the name of the global constructors on the device. + virtual Expected<StringRef> getGlobalConstructorName(DeviceImageTy &Image) { + return ""; } - /// Invokes any global destructors on the device if present and is required - /// by the target. - virtual Error callGlobalDestructors(GenericPluginTy &Plugin, - DeviceImageTy &Image) { - return Error::success(); + /// Return the name of the global destructors on the device. 
+ virtual Expected<StringRef> getGlobalDestructorName(DeviceImageTy &Image) { + return ""; } /// Query for the completion of the pending operations on the __tgt_async_info @@ -928,8 +927,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy { bool useAutoZeroCopy(); virtual bool useAutoZeroCopyImpl() { return false; } - /// Allocate and construct a kernel object. - virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0; + /// Retrieve the kernel with name \p Name from image \p Image (or any image if + /// \p Image is null) and return it. If \p Optional is true, the function + /// returns success if there is no kernel with the given name. + Expected<GenericKernelTy *> getKernel(llvm::StringRef Name, + DeviceImageTy *Image = nullptr, + bool Optional = false); /// Reference to the underlying plugin that created this device. GenericPluginTy &Plugin; @@ -947,6 +950,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0); private: + /// Allocate and construct a kernel object (users should use getKernel). + virtual Expected<GenericKernelTy &> + constructKernelImpl(llvm::StringRef Name) = 0; + /// Get and set the stack size and heap size for the device. If not used, the /// plugin can implement the setters as no-op and setting the output /// value to zero for the getters. @@ -1046,6 +1053,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy { private: DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0}; DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0}; + + DenseMap<StringRef, GenericKernelTy *> KernelMap; }; /// Class implementing common functionalities of offload plugins. 
Each plugin diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 84d946507ea74a..ac7709f4229bc3 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -809,7 +809,7 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) { Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { for (DeviceImageTy *Image : LoadedImages) - if (auto Err = callGlobalDestructors(Plugin, *Image)) + if (auto Err = callGlobalCtorDtor(*Image, /*Ctor*/ false)) return Err; if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) { @@ -866,6 +866,37 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { return deinitImpl(); } + +Error GenericDeviceTy::callGlobalCtorDtor(DeviceImageTy &Image, bool IsCtor) { + auto NameOrErr = + IsCtor ? getGlobalConstructorName(Image) : getGlobalDestructorName(Image); + if (auto Err = NameOrErr.takeError()) + return Err; + // No error but no name, that means there is no ctor/dtor. + if (NameOrErr->empty()) + return Plugin::success(); + + auto KernelOrErr = getKernel(*NameOrErr, &Image, /*Optional=*/true); + if (auto Err = KernelOrErr.takeError()) + return Err; + + if (GenericKernelTy *Kernel = *KernelOrErr) { + KernelArgsTy KernelArgs; + KernelArgs.NumTeams[0] = KernelArgs.ThreadLimit[0] = 1; + AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr); + if (auto Err = Kernel->launch(*this, /*ArgPtrs=*/nullptr, + /*ArgOffsets=*/nullptr, KernelArgs, + AsyncInfoWrapper)) + return Err; + + Error Err = Plugin::success(); + AsyncInfoWrapper.finalize(Err); + return Err; + } + + return Plugin::success(); +} + Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, const __tgt_device_image *InputTgtImage) { @@ -927,8 +958,8 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, #endif // Call any global constructors present on the device. 
- if (auto Err = callGlobalConstructors(Plugin, *Image)) - return std::move(Err); + if (auto Err = callGlobalCtorDtor(*Image, /*Ctor*/ true)) + return Err; // Return the pointer to the table of entries. return Image; @@ -1533,6 +1564,66 @@ Error GenericDeviceTy::printInfo() { return Plugin::success(); } +Expected<GenericKernelTy *> GenericDeviceTy::getKernel(llvm::StringRef Name, + DeviceImageTy *ImagePtr, + bool Optional) { + bool KernelFound = false; + GenericKernelTy *&KernelPtr = KernelMap[Name]; + if (!KernelPtr) { + GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); + + auto CheckImage = [&](DeviceImageTy &Image) -> GenericKernelTy * { + if (!GHandler.isSymbolInImage(*this, Image, Name)) + return nullptr; + + auto KernelOrErr = constructKernelImpl(Name); + if (Error Err = KernelOrErr.takeError()) { + [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); + DP("Failed to construct kernel ('%s'): %s", Name.data(), + ErrStr.c_str()); + return nullptr; + } + + GenericKernelTy &Kernel = *KernelOrErr; + if (auto Err = Kernel.init(*this, Image)) { + [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); + DP("Failed to initialize kernel ('%s'): %s", Name.data(), + ErrStr.c_str()); + return nullptr; + } + + return &Kernel; + }; + + if (ImagePtr) { + KernelPtr = CheckImage(*ImagePtr); + } else { + for (DeviceImageTy *Image : LoadedImages) { + KernelPtr = CheckImage(*Image); + if (KernelPtr) + break; + } + } + } + + // If we didn't find the kernel and it was optional, we do not emit an error. + if (!KernelPtr && !KernelFound && Optional) + return nullptr; + // If we didn't find the kernel and it was not optional, we will emit an + // error. + if (!KernelPtr && !KernelFound) + return Plugin::error("Kernel '%s' not found, searched %zu images", + Name.data(), + ImagePtr ? size_t(1) : LoadedImages.size()); + // If we found the kernel but couldn't initialize it, we will emit an error. 
+ if (!KernelPtr) + return Plugin::error( + "Kernel '%s' failed to initialize, searched %zu images", Name.data(), + ImagePtr ? size_t(1) : LoadedImages.size()); + // Found the kernel and initialized it. + return KernelPtr; +} + Error GenericDeviceTy::createEvent(void **EventPtrStorage) { return createEventImpl(EventPtrStorage); } @@ -2147,20 +2238,14 @@ int32_t GenericPluginTy::get_function(__tgt_device_binary Binary, GenericDeviceTy &Device = Image.getDevice(); - auto KernelOrErr = Device.constructKernel(Name); + auto KernelOrErr = Device.getKernel(Name, &Image); if (Error Err = KernelOrErr.takeError()) { REPORT("Failure to look up kernel: %s\n", toString(std::move(Err)).data()); return OFFLOAD_FAIL; } - GenericKernelTy &Kernel = *KernelOrErr; - if (auto Err = Kernel.init(Device, Image)) { - REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data()); - return OFFLOAD_FAIL; - } - // Note that this is not the kernel's device address. - *KernelPtr = &Kernel; + *KernelPtr = *KernelOrErr; return OFFLOAD_SUCCESS; } diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index b6465d61bd033f..2a7b6f844cca05 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -393,22 +393,17 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::success(); } - virtual Error callGlobalConstructors(GenericPluginTy &Plugin, - DeviceImageTy &Image) override { - // Check for the presense of global destructors at initialization time. This - // is required when the image may be deallocated before destructors are run. 
- GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); - if (Handler.isSymbolInImage(*this, Image, "nvptx$device$fini")) - Image.setPendingGlobalDtors(); - - return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true); + virtual Expected<StringRef> + getGlobalConstructorName(DeviceImageTy &Image) override { + if (auto Err = prepareGlobalCtorDtorCommon(Image, /*IsCtor=*/true)) + return Err; + return "nvptx$device$init"; } - - virtual Error callGlobalDestructors(GenericPluginTy &Plugin, - DeviceImageTy &Image) override { - if (Image.hasPendingGlobalDtors()) - return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false); - return Plugin::success(); + virtual Expected<StringRef> + getGlobalDestructorName(DeviceImageTy &Image) override { + if (auto Err = prepareGlobalCtorDtorCommon(Image, /*IsCtor=*/false)) + return Err; + return "nvptx$device$fini"; } Expected<std::unique_ptr<MemoryBuffer>> @@ -471,13 +466,14 @@ struct CUDADeviceTy : public GenericDeviceTy { } /// Allocate and construct a CUDA kernel. - Expected<GenericKernelTy &> constructKernel(const char *Name) override { + Expected<GenericKernelTy &> + constructKernelImpl(llvm::StringRef Name) override { // Allocate and construct the CUDA kernel. CUDAKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>(); if (!CUDAKernel) return Plugin::error("Failed to allocate memory for CUDA kernel"); - new (CUDAKernel) CUDAKernelTy(Name); + new (CUDAKernel) CUDAKernelTy(Name.data()); return *CUDAKernel; } @@ -1149,8 +1145,7 @@ struct CUDADeviceTy : public GenericDeviceTy { using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>; using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>; - Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image, - bool IsCtor) { + Error prepareGlobalCtorDtorCommon(DeviceImageTy &Image, bool IsCtor) { const char *KernelName = IsCtor ? 
"nvptx$device$init" : "nvptx$device$fini"; // Perform a quick check for the named kernel in the image. The kernel // should be created by the 'nvptx-lower-ctor-dtor' pass. diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index fe296b77c7d557..604b2648b6d629 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -151,13 +151,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy { std::string getComputeUnitKind() const override { return "generic-64bit"; } /// Construct the kernel for a specific image on the device. - Expected<GenericKernelTy &> constructKernel(const char *Name) override { + Expected<GenericKernelTy &> + constructKernelImpl(llvm::StringRef Name) override { // Allocate and construct the kernel. GenELF64KernelTy *GenELF64Kernel = Plugin.allocate<GenELF64KernelTy>(); if (!GenELF64Kernel) return Plugin::error("Failed to allocate memory for GenELF64 kernel"); - new (GenELF64Kernel) GenELF64KernelTy(Name); + new (GenELF64Kernel) GenELF64KernelTy(Name.data()); return *GenELF64Kernel; } diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt index c5f5d902fad14c..75c3ea68eed107 100644 --- a/offload/src/CMakeLists.txt +++ b/offload/src/CMakeLists.txt @@ -62,6 +62,23 @@ endforeach() target_compile_options(omptarget PRIVATE ${offload_compile_flags}) target_link_options(omptarget PRIVATE ${offload_link_flags}) +add_llvm_library(offload.kernels + STATIC + + Kernels/Memory.cpp + + LINK_LIBS + PUBLIC + omptarget.devicertl + + NO_INSTALL_RPATH + BUILDTREE_ONLY +) + +list(JOIN LIBOMPTARGET_DEVICE_ARCHITECTURES "," KERNEL_OFFLOAD_ARCHS) +target_compile_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -foffload-lto -foffload-via-llvm ) +target_link_options(offload.kernels PRIVATE -x cuda --offload-arch=${KERNEL_OFFLOAD_ARCHS} -nocudalib -nogpulib -foffload-lto -foffload-via-llvm ) + # libomptarget.so needs to 
be aware of where the plugins live as they # are now separated in the build directory. set_target_properties(omptarget PROPERTIES @@ -69,3 +86,4 @@ set_target_properties(omptarget PROPERTIES INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..") install(TARGETS omptarget LIBRARY COMPONENT omptarget DESTINATION "${OFFLOAD_INSTALL_LIBDIR}") +install(TARGETS offload.kernels LIBRARY COMPONENT offload.kernels DESTINATION "${OFFLOAD_INSTALL_LIBDIR}") diff --git a/offload/src/Kernels/Memory.cpp b/offload/src/Kernels/Memory.cpp new file mode 100644 index 00000000000000..94777872106b05 --- /dev/null +++ b/offload/src/Kernels/Memory.cpp @@ -0,0 +1,53 @@ +//===-- Kernels/Memory.cpp - Memory related kernel definitions ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include <cstdint> + +#define LAUNCH_BOUNDS(MIN, MAX) \ + __attribute__((launch_bounds(MAX), amdgpu_flat_work_group_size(MIN, MAX))) +#define INLINE [[clang::always_inline]] inline +#define KERNEL [[gnu::weak]] __global__ +#define DEVICE __device__ + +extern "C" { +DEVICE int ompx_thread_id(int Dim); +DEVICE int ompx_block_id(int Dim); +DEVICE int ompx_block_dim(int Dim); +DEVICE int ompx_grid_dim(int Dim); +} + +namespace { +INLINE +DEVICE void __memset_impl(char *Ptr, int ByteVal, size_t NumBytes) { + int TId = ompx_thread_id(0); + int BId = ompx_block_id(0); + int BDim = ompx_block_dim(0); + size_t GId = BId * BDim + TId; + if (GId < NumBytes) + Ptr[GId] = ByteVal; +} +} // namespace + +extern "C" { +KERNEL void LAUNCH_BOUNDS(1, 256) + __memset(char *Ptr, int ByteVal, size_t NumBytes) { + __memset_impl(Ptr, ByteVal, NumBytes); +} + +KERNEL void 
LAUNCH_BOUNDS(1, 256) + __memset_zero(char *Ptr, int ByteVal, size_t NumBytes) { + __memset_impl(Ptr, 0, NumBytes); +} + +KERNEL void LAUNCH_BOUNDS(1, 256) + __memset_ones(char *Ptr, int ByteVal, size_t NumBytes) { + __memset_impl(Ptr, ~0, NumBytes); +} +} diff --git a/offload/src/OpenMP/API.cpp b/offload/src/OpenMP/API.cpp index e59bdba8abf0e4..210cadab25edee 100644 --- a/offload/src/OpenMP/API.cpp +++ b/offload/src/OpenMP/API.cpp @@ -392,25 +392,52 @@ EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes, DP("filling memory on host via memset"); memset(Ptr, ByteVal, NumBytes); // ignore return value, memset() cannot fail } else { - // TODO: replace the omp_target_memset() slow path with the fast path. - // That will require the ability to execute a kernel from within - // libomptarget.so (which we do not have at the moment). - - // This is a very slow path: create a filled array on the host and upload - // it to the GPU device. - int InitialDevice = omp_get_initial_device(); - void *Shadow = omp_target_alloc(NumBytes, InitialDevice); - if (Shadow) { - (void)memset(Shadow, ByteVal, NumBytes); - (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum, - InitialDevice); - (void)omp_target_free(Shadow, InitialDevice); - } else { - // If the omp_target_alloc has failed, let's just not do anything. - // omp_target_memset does not have any good way to fail, so we - // simply avoid a catastrophic failure of the process for now. - DP("omp_target_memset failed to fill memory due to error with " - "omp_target_alloc"); + struct LaunchArgsTy { + void *Ptr; + int ByteVal; + size_t NumBytes; + } LaunchArgs{Ptr, ByteVal, NumBytes}; + + auto NumThreads = NumBytes > 256 ? 
256 : NumBytes; + auto NumBlocks = (NumBytes + 255) / 256; + const char *KernelName = "__memset"; + switch (ByteVal) { + case 0: + KernelName = "__memset_zero"; + break; + case ~0: + KernelName = "__memset_ones"; + break; + default: + break; + }; + // Try to launch the __memset kernel first. + KernelArgsTy KernelArgs; + KernelArgs.NumTeams[0] = NumBlocks; + KernelArgs.ThreadLimit[0] = NumThreads; + struct { + size_t LaunchArgsSize; + void *LaunchArgs; + } WrappedLaunchArgs = {sizeof(LaunchArgs), &LaunchArgs}; + KernelArgs.ArgPtrs = reinterpret_cast<void **>(&WrappedLaunchArgs); + KernelArgs.Flags.IsCUDA = true; + if (__tgt_launch_by_name(nullptr, DeviceNum, KernelName, &KernelArgs)) { + // This is a very slow path: create a filled array on the host and upload + // it to the GPU device. + int InitialDevice = omp_get_initial_device(); + void *Shadow = omp_target_alloc(NumBytes, InitialDevice); + if (Shadow) { + (void)memset(Shadow, ByteVal, NumBytes); + (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum, + InitialDevice); + (void)omp_target_free(Shadow, InitialDevice); + } else { + // If the omp_target_alloc has failed, let's just not do anything. + // omp_target_memset does not have any good way to fail, so we + // simply avoid a catastrophic failure of the process for now. 
+ DP("omp_target_memset failed to fill memory due to error with " + "omp_target_alloc"); + } } } diff --git a/offload/src/device.cpp b/offload/src/device.cpp index 943c7782787306..9b782009d08cd0 100644 --- a/offload/src/device.cpp +++ b/offload/src/device.cpp @@ -226,6 +226,17 @@ bool DeviceTy::printDeviceInfo() { return true; } +int32_t DeviceTy::getKernelHandle(llvm::StringRef Name, void **HandlePtr) { + auto KernelOrErr = RTL->getDevice(RTLDeviceID).getKernel(Name); + if (!KernelOrErr) { + [[maybe_unused]] auto ErrStr = toString(KernelOrErr.takeError()); + DP("%s\n", ErrStr.c_str()); + return OFFLOAD_FAIL; + } + *HandlePtr = *KernelOrErr; + return OFFLOAD_SUCCESS; +} + // Whether data can be copied to DstDevice directly bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) { if (RTL != DstDevice.RTL) diff --git a/offload/src/exports b/offload/src/exports index 7bdc7d2a531bb3..b7671dd1421bd6 100644 --- a/offload/src/exports +++ b/offload/src/exports @@ -27,6 +27,7 @@ VERS1.0 { __tgt_target_nowait_mapper; __tgt_target_teams_nowait_mapper; __tgt_target_kernel; + __tgt_launch_by_name; __tgt_target_kernel_nowait; __tgt_target_nowait_query; __tgt_target_kernel_replay; diff --git a/offload/src/interface.cpp b/offload/src/interface.cpp index 21f9114ac2b088..dad643187fba26 100644 --- a/offload/src/interface.cpp +++ b/offload/src/interface.cpp @@ -394,6 +394,21 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, HostPtr, KernelArgs); } +EXTERN int __tgt_launch_by_name(ident_t *Loc, int64_t DeviceId, + const char *KernelName, + KernelArgsTy *KernelArgs) { + auto DeviceOrErr = PM->getDevice(DeviceId); + if (!DeviceOrErr) + FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); + auto &Device = *DeviceOrErr; + void *Handle; + if (Device.getKernelHandle(KernelName, &Handle)) + return OFFLOAD_FAIL; + AsyncInfoTy AsyncInfo(*DeviceOrErr); + return DeviceOrErr->launchKernel(Handle, nullptr, nullptr, *KernelArgs, + 
AsyncInfo); +} + /// Activates the record replay mechanism. /// \param DeviceId The device identifier to execute the target region. /// \param MemorySize The number of bytes to be (pre-)allocated diff --git a/offload/test/jit/type_punning.c b/offload/test/jit/type_punning.c index 574168b8a69cbb..c2cd415a5fc75f 100644 --- a/offload/test/jit/type_punning.c +++ b/offload/test/jit/type_punning.c @@ -13,8 +13,8 @@ // Ensure that there is only the kernel function left, not any outlined // parallel regions. // -// CHECK: define -// CHECK-NOT: define +// CHECK: define {{.*}}__omp_offloading_ +// CHECK-NOT: call {{.*}}@__ #include <omp.h> #include <stdio.h> diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index b4fc7d3b333b35..907300096f1665 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -179,7 +179,10 @@ def remove_suffix_if_present(name): return name def add_libraries(source): - return source + " " + config.llvm_library_intdir + "/libomptarget.devicertl.a" + source += " " + config.llvm_library_intdir + "/libomptarget.devicertl.a" + source += " " + config.llvm_library_intdir + "/liboffload.kernels.a" + return source + # Add platform targets host_targets = [ diff --git a/offload/test/offloading/kernels_memset.c b/offload/test/offloading/kernels_memset.c new file mode 100644 index 00000000000000..4cdbd56c366a88 --- /dev/null +++ b/offload/test/offloading/kernels_memset.c @@ -0,0 +1,61 @@ +// RUN: %libomptarget-compile-generic +// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | \ +// RUN: %fcheck-generic +// +// REQUIRES: gpu + +#include <omp.h> + +int main(int argc, char *argv[]) { + const int num_blocks = 64; + const int block_size = 256; + const int N = num_blocks * block_size; + int *data = + (int *)omp_target_alloc(N * sizeof(int), omp_get_default_device()); + + // clang-format off + // CHECK: Launching kernel __memset_zero with 256 blocks and 256 threads in SPMD mode + // CHECK: Launching kernel __omp_offloading{{.*}} with 64 
blocks and 256 threads in SPMD mode + omp_target_memset(data, '\0', N * sizeof(int), omp_get_default_device()); + // clang-format on + +#pragma omp target teams num_teams(num_blocks) thread_limit(block_size) + { +#pragma omp parallel + if (data[omp_get_team_num() * omp_get_num_threads() + + omp_get_thread_num()] != 0) + __builtin_trap(); + } + + // clang-format off + // CHECK: Launching kernel __memset_ones with 256 blocks and 256 threads in SPMD mode + // CHECK: Launching kernel __omp_offloading{{.*}} with 64 blocks and 256 threads in SPMD mode + omp_target_memset(data, ~0, N * sizeof(int), omp_get_default_device()); + // clang-format on + +#pragma omp target teams num_teams(num_blocks) thread_limit(block_size) + { +#pragma omp parallel + if (data[omp_get_team_num() * omp_get_num_threads() + + omp_get_thread_num()] != ~0) + __builtin_trap(); + } + + // clang-format off + // CHECK: Launching kernel __memset with 256 blocks and 256 threads in SPMD mode + // CHECK: Launching kernel __omp_offloading{{.*}} with 256 blocks and 256 threads in SPMD mode + omp_target_memset(data, '$', N * sizeof(int), omp_get_default_device()); + // clang-format on + + char *cdata = (char *)data; +#pragma omp target teams num_teams(num_blocks * sizeof(int)) \ + thread_limit(block_size) + { +#pragma omp parallel + if (cdata[omp_get_team_num() * omp_get_num_threads() + + omp_get_thread_num()] != '$') + __builtin_trap(); + } + + return 0; +} _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits