atmnpatel updated this revision to Diff 398370. atmnpatel added a comment. - Fixed lifetime issue around ffi_call - Addressed comments
The existing x86 plugin uses libffi, so this one does as well; there is no explicit benefit in doing so. Is it worth keeping? Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D113359/new/ https://reviews.llvm.org/D113359 Files: clang/lib/Basic/TargetInfo.cpp clang/lib/Basic/Targets/X86.h clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp clang/lib/CodeGen/CodeGenModule.cpp clang/lib/Driver/ToolChains/Gnu.cpp clang/lib/Frontend/CompilerInvocation.cpp llvm/include/llvm/ADT/Triple.h llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h llvm/lib/Support/Triple.cpp openmp/CMakeLists.txt openmp/libomptarget/DeviceRTL/CMakeLists.txt openmp/libomptarget/DeviceRTL/include/ThreadEnvironment.h openmp/libomptarget/DeviceRTL/src/Debug.cpp openmp/libomptarget/DeviceRTL/src/Kernel.cpp openmp/libomptarget/DeviceRTL/src/Mapping.cpp openmp/libomptarget/DeviceRTL/src/Misc.cpp openmp/libomptarget/DeviceRTL/src/Synchronization.cpp openmp/libomptarget/DeviceRTL/src/Utils.cpp openmp/libomptarget/plugins/CMakeLists.txt openmp/libomptarget/plugins/vgpu/CMakeLists.txt openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.cpp openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.h openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.h openmp/libomptarget/plugins/vgpu/src/rtl.cpp openmp/libomptarget/src/rtl.cpp openmp/libomptarget/test/lit.cfg
Index: openmp/libomptarget/test/lit.cfg =================================================================== --- openmp/libomptarget/test/lit.cfg +++ openmp/libomptarget/test/lit.cfg @@ -114,9 +114,11 @@ # Scan all the valid targets. for libomptarget_target in config.libomptarget_all_targets: + print("Checking {}".format(libomptarget_target)) # Is this target in the current system? If so create a compile, run and test # command. Otherwise create command that return false. if libomptarget_target == config.libomptarget_current_target: + print("First") config.substitutions.append(("%libomptarget-compilexx-run-and-check-generic", "%libomptarget-compilexx-run-and-check-" + libomptarget_target)) config.substitutions.append(("%libomptarget-compile-run-and-check-generic", @@ -176,6 +178,7 @@ config.substitutions.append(("%fcheck-" + libomptarget_target, \ config.libomptarget_filecheck + " %s")) else: + print("Second") config.substitutions.append(("%libomptarget-compile-run-and-check-" + \ libomptarget_target, \ "echo ignored-command")) Index: openmp/libomptarget/src/rtl.cpp =================================================================== --- openmp/libomptarget/src/rtl.cpp +++ openmp/libomptarget/src/rtl.cpp @@ -24,12 +24,13 @@ // List of all plugins that can support offloading. static const char *RTLNames[] = { /* PowerPC target */ "libomptarget.rtl.ppc64.so", - /* x86_64 target */ "libomptarget.rtl.x86_64.so", + /* x86_64 target "libomptarget.rtl.x86_64.so", */ /* CUDA target */ "libomptarget.rtl.cuda.so", /* AArch64 target */ "libomptarget.rtl.aarch64.so", /* SX-Aurora VE target */ "libomptarget.rtl.ve.so", /* AMDGPU target */ "libomptarget.rtl.amdgpu.so", /* Remote target */ "libomptarget.rtl.rpc.so", + /* Virtual GPU target */ "libomptarget.rtl.vgpu.so", }; PluginManager *PM; @@ -79,7 +80,13 @@ // is correct and if they are supporting any devices. 
for (auto *Name : RTLNames) { DP("Loading library '%s'...\n", Name); - void *dynlib_handle = dlopen(Name, RTLD_NOW); + + int Flags = RTLD_NOW; + + if (strcmp(Name, "libomptarget.rtl.vgpu.so") == 0) + Flags |= RTLD_GLOBAL; + + void *dynlib_handle = dlopen(Name, Flags); if (!dynlib_handle) { // Library does not exist or cannot be found. Index: openmp/libomptarget/plugins/vgpu/src/rtl.cpp =================================================================== --- /dev/null +++ openmp/libomptarget/plugins/vgpu/src/rtl.cpp @@ -0,0 +1,609 @@ +//===------RTLs/vgpu/src/rtl.cpp - Target RTLs Implementation ----- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RTL for virtual (x86) GPU +// +//===----------------------------------------------------------------------===// + +#include <barrier> +#include <cassert> +#include <cmath> +#include <condition_variable> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <dlfcn.h> +#include <ffi.h> +#include <functional> +#include <gelf.h> +#include <link.h> +#include <list> +#include <memory> +#include <mutex> +#include <queue> +#include <thread> +#include <vector> + +#include "Debug.h" +#include "ThreadEnvironment.h" +#include "ThreadEnvironmentImpl.h" +#include "omptarget.h" +#include "omptargetplugin.h" + +#ifndef TARGET_NAME +#define TARGET_NAME Generic ELF - 64bit +#endif +#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL" + +#ifndef TARGET_ELF_ID +#define TARGET_ELF_ID 0 +#endif + +#include "elf_common.h" + +#define OFFLOADSECTIONNAME "omp_offloading_entries" + +#define DEBUG false + +struct FFICallTy { + ffi_cif CIF; + std::vector<ffi_type *> ArgsTypes; + std::vector<void *> Args; + std::vector<void *> Ptrs; + void (*Entry)(void); + + 
FFICallTy(int32_t ArgNum, void **TgtArgs, ptrdiff_t *TgtOffsets, + void *TgtEntryPtr) + : ArgsTypes(ArgNum, &ffi_type_pointer), Args(ArgNum), Ptrs(ArgNum) { + for (int32_t i = 0; i < ArgNum; ++i) { + Ptrs[i] = (void *)((intptr_t)TgtArgs[i] + TgtOffsets[i]); + Args[i] = &Ptrs[i]; + } + + ffi_status status = ffi_prep_cif(&CIF, FFI_DEFAULT_ABI, ArgNum, + &ffi_type_void, &ArgsTypes[0]); + + assert(status == FFI_OK && "Unable to prepare target launch!"); + + *((void **)&Entry) = TgtEntryPtr; + } +}; + +/// Array of Dynamic libraries loaded for this target. +struct DynLibTy { + char *FileName; + void *Handle; +}; + +/// Keep entries table per device. +struct FuncOrGblEntryTy { + __tgt_target_table Table; +}; + +thread_local ThreadEnvironmentTy *ThreadEnvironment; + +/// Class containing all the device information. +class RTLDeviceInfoTy { + std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries; + +public: + std::list<DynLibTy> DynLibs; + + // Record entry point associated with device. + void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin, + __tgt_offload_entry *end) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncGblEntries[device_id].emplace_back(); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + E.Table.EntriesBegin = begin; + E.Table.EntriesEnd = end; + } + + // Return true if the entry is associated with device. + bool findOffloadEntry(int32_t device_id, void *addr) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd; + i < e; ++i) { + if (i->addr == addr) + return true; + } + + return false; + } + + // Return the pointer to the target entries table. 
+ __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + return &E.Table; + } + + RTLDeviceInfoTy() : FuncGblEntries(1) { } + + ~RTLDeviceInfoTy() { + // Close dynamic libraries + for (auto &lib : DynLibs) { + if (lib.Handle) { + dlclose(lib.Handle); + remove(lib.FileName); + } + } + } +}; + +static RTLDeviceInfoTy DeviceInfo; + +std::vector<CTAEnvironmentTy *> CTAEnvironments; +std::vector<WarpEnvironmentTy *> WarpEnvironments; + +struct VGPUTy { + struct KernelTy { + FFICallTy *Call; + int NumTeams; + + KernelTy(FFICallTy *Call, int NumTeams) : Call(Call), NumTeams(NumTeams) {} + }; + + struct VGPUStreamTy { + std::queue<KernelTy> Kernels; + std::mutex Mtx; + + void emplace(FFICallTy *Call, int NumTeams) { + std::lock_guard Guard(Mtx); + Kernels.emplace(Call, NumTeams); + } + + KernelTy front() { + std::lock_guard Guard(Mtx); + return Kernels.front(); + } + + void pop() { + std::lock_guard Guard(Mtx); + Kernels.pop(); + } + + bool empty() { + std::lock_guard Guard(Mtx); + return Kernels.empty(); + } + }; + + struct AsyncInfoQueueTy { + std::deque<__tgt_async_info *> Streams; + std::mutex Mtx; + + bool empty() { + std::lock_guard Guard(Mtx); + return Streams.empty(); + } + + __tgt_async_info *front() { + std::lock_guard Guard(Mtx); + return Streams.front(); + } + + void pop() { + std::lock_guard Guard(Mtx); + Streams.pop_front(); + } + + void emplace(__tgt_async_info *AsyncInfo) { + std::lock_guard Guard(Mtx); + Streams.emplace_back(AsyncInfo); + } + } ExecutionQueue; + + VGPUStreamTy *getStream(__tgt_async_info *AsyncInfo) { + assert(AsyncInfo != nullptr && "async_info ptr was null"); + + if (!AsyncInfo->Queue) + AsyncInfo->Queue = new VGPUStreamTy(); + + return reinterpret_cast<VGPUStreamTy *>(AsyncInfo->Queue); + } + + std::atomic<bool> Running; + std::vector<std::thread> Threads; + int WarpsPerCTA; + int 
NumCTAs; + + std::unique_ptr<std::barrier<std::function<void(void)>>> Barrier; + std::condition_variable WorkAvailable; + std::mutex WorkDoneMtx; + std::condition_variable WorkDone; + + VGPUTy(int NumThreads = -1, int ThreadsPerWarp = -1, int WarpsPerCTA = -1) + : Running(true) { + if (const char *Env = std::getenv("VGPU_NUM_THREADS")) + NumThreads = std::stoi(Env); + if (const char *Env = std::getenv("VGPU_THREADS_PER_WARP")) + ThreadsPerWarp = std::stoi(Env); + if (const char *Env = std::getenv("VGPU_WARPS_PER_CTA")) + WarpsPerCTA = std::stoi(Env); + + if (NumThreads == -1) + NumThreads = std::thread::hardware_concurrency(); + if (ThreadsPerWarp == -1) + ThreadsPerWarp = NumThreads; + if (WarpsPerCTA == -1) + WarpsPerCTA = 1; + + NumCTAs = NumThreads / (ThreadsPerWarp * WarpsPerCTA); + + printf("NumThreads: %d, ThreadsPerWarp: %d, WarpsPerCTA: %d\n", NumThreads, + ThreadsPerWarp, WarpsPerCTA); + + assert(NumThreads % ThreadsPerWarp == 0 && NumThreads % WarpsPerCTA == 0 && + "Invalid VGPU Config"); + + Barrier = std::make_unique<std::barrier<std::function<void(void)>>>( + NumThreads, []() {}); + + Threads.reserve(NumThreads); + + auto GlobalThreadIdx = 0; + for (auto CTAIdx = 0; CTAIdx < NumCTAs; CTAIdx++) { + auto *CTAEnv = + new CTAEnvironmentTy(CTAIdx, NumThreads / NumCTAs, NumCTAs); + for (auto WarpIdx = 0; WarpIdx < WarpsPerCTA; WarpIdx++) { + auto *WarpEnv = new WarpEnvironmentTy(WarpIdx, ThreadsPerWarp); + for (auto ThreadIdx = 0; ThreadIdx < ThreadsPerWarp; ThreadIdx++) { + Threads.emplace_back([this, ThreadIdx, GlobalThreadIdx, CTAEnv, + WarpEnv]() { + ThreadEnvironment = + new ThreadEnvironmentTy(ThreadIdx, WarpEnv, CTAEnv); + while (Running) { + { + std::unique_lock<std::mutex> UniqueLock(ExecutionQueue.Mtx); + + WorkAvailable.wait(UniqueLock, [&]() { + if (!Running) { + return true; + } + bool IsEmpty = ExecutionQueue.Streams.empty(); + + return !IsEmpty; + }); + } + + if (ExecutionQueue.empty()) { + continue; + } + + while (!ExecutionQueue.empty()) { 
+ auto *Stream = getStream(ExecutionQueue.front()); + while (!Stream->empty()) { + auto KernelInfo = Stream->front(); + + const unsigned NumTeams = KernelInfo.NumTeams; + unsigned TeamIdx = 0; + while (TeamIdx < KernelInfo.NumTeams) { + if (CTAEnv->getId() < KernelInfo.NumTeams) { + ThreadEnvironment->setBlockEnv( + new ThreadBlockEnvironmentTy( + TeamIdx + CTAEnv->getId(), NumTeams)); + ffi_call(&KernelInfo.Call->CIF, KernelInfo.Call->Entry, NULL, &(KernelInfo.Call->Args)[0]); + ThreadEnvironment->resetBlockEnv(); + } + Barrier->arrive_and_wait(); + TeamIdx += NumCTAs; + } + + if (GlobalThreadIdx == 0) { + Stream->pop(); + delete KernelInfo.Call; + } + + Barrier->arrive_and_wait(); + } + if (GlobalThreadIdx == 0) { + ExecutionQueue.pop(); + WorkDone.notify_all(); + } + Barrier->arrive_and_wait(); + } + } + delete ThreadEnvironment; + }); + GlobalThreadIdx = (GlobalThreadIdx + 1) % NumThreads; + } + WarpEnvironments.push_back(WarpEnv); + } + CTAEnvironments.push_back(CTAEnv); + } + } + + ~VGPUTy() { + awaitAll(); + + Running = false; + WorkAvailable.notify_all(); + + for (auto &Thread : Threads) { + if (Thread.joinable()) + Thread.join(); + } + + for (auto *CTAEnv : CTAEnvironments) + delete CTAEnv; + + for (auto *WarpEnv : WarpEnvironments) + delete WarpEnv; + } + + void await(__tgt_async_info *AsyncInfo) { + std::unique_lock UniqueLock(getStream(AsyncInfo)->Mtx); + WorkDone.wait(UniqueLock, + [&]() { return getStream(AsyncInfo)->Kernels.empty(); }); + } + + void awaitAll() { + while (!ExecutionQueue.empty()) { + await(ExecutionQueue.front()); + } + } + + void scheduleAsync(__tgt_async_info *AsyncInfo, FFICallTy *Call, + int NumTeams) { + if (NumTeams == 0) + NumTeams = NumCTAs; + auto *Stream = getStream(AsyncInfo); + Stream->emplace(Call, NumTeams); + ExecutionQueue.emplace(AsyncInfo); + WorkAvailable.notify_all(); + } +}; + +VGPUTy VGPU; + +#ifdef __cplusplus +extern "C" { +#endif + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { +// If we don't 
have a valid ELF ID we can just fail. +#if TARGET_ELF_ID < 1 + return 0; +#else + return elf_check_machine(image, TARGET_ELF_ID); +#endif +} + +int32_t __tgt_rtl_number_of_devices() { return 1; } + +int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; } + +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, + __tgt_device_image *image) { + + DP("Dev %d: load binary from " DPxMOD " image\n", device_id, + DPxPTR(image->ImageStart)); + + assert(device_id >= 0 && device_id < 1 && "bad dev id"); + + size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart; + size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin); + DP("Expecting to have %zd entries defined.\n", NumEntries); + + // Is the library version incompatible with the header file? + if (elf_version(EV_CURRENT) == EV_NONE) { + DP("Incompatible ELF library!\n"); + return NULL; + } + + // Obtain elf handler + Elf *e = elf_memory((char *)image->ImageStart, ImageSize); + if (!e) { + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return NULL; + } + + if (elf_kind(e) != ELF_K_ELF) { + DP("Invalid Elf kind!\n"); + elf_end(e); + return NULL; + } + + // Find the entries section offset + Elf_Scn *section = 0; + Elf64_Off entries_offset = 0; + + size_t shstrndx; + + if (elf_getshdrstrndx(e, &shstrndx)) { + DP("Unable to get ELF strings index!\n"); + elf_end(e); + return NULL; + } + + while ((section = elf_nextscn(e, section))) { + GElf_Shdr hdr; + gelf_getshdr(section, &hdr); + + if (!strcmp(elf_strptr(e, shstrndx, hdr.sh_name), OFFLOADSECTIONNAME)) { + entries_offset = hdr.sh_addr; + break; + } + } + + if (!entries_offset) { + DP("Entries Section Offset Not Found\n"); + elf_end(e); + return NULL; + } + + DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset)); + + // load dynamic library and get the entry points. 
We use the dl library + // to do the loading of the library, but we could do it directly to avoid + // the dump to the temporary file. + // + // 1) Create tmp file with the library contents. + // 2) Use dlopen to load the file and dlsym to retrieve the symbols. + char tmp_name[] = "/tmp/tmpfile_XXXXXX"; + int tmp_fd = mkstemp(tmp_name); + + if (tmp_fd == -1) { + elf_end(e); + return NULL; + } + + FILE *ftmp = fdopen(tmp_fd, "wb"); + + if (!ftmp) { + elf_end(e); + return NULL; + } + + fwrite(image->ImageStart, ImageSize, 1, ftmp); + fclose(ftmp); + + DynLibTy Lib = {tmp_name, dlopen(tmp_name, RTLD_NOW | RTLD_GLOBAL)}; + + if (!Lib.Handle) { + DP("Target library loading error: %s\n", dlerror()); + elf_end(e); + return NULL; + } + + DeviceInfo.DynLibs.push_back(Lib); + + struct link_map *libInfo = (struct link_map *)Lib.Handle; + + // The place where the entries info is loaded is the library base address + // plus the offset determined from the ELF file. + Elf64_Addr entries_addr = libInfo->l_addr + entries_offset; + + DP("Pointer to first entry to be loaded is (" DPxMOD ").\n", + DPxPTR(entries_addr)); + + // Table of pointers to all the entries in the target. + __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr; + + __tgt_offload_entry *entries_begin = &entries_table[0]; + __tgt_offload_entry *entries_end = entries_begin + NumEntries; + + if (!entries_begin) { + DP("Can't obtain entries begin\n"); + elf_end(e); + return NULL; + } + + DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n", + DPxPTR(entries_begin), DPxPTR(entries_end)); + DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end); + + elf_end(e); + + return DeviceInfo.getOffloadEntriesTable(device_id); +} + +// Sample implementation of explicit memory allocator. For this plugin all +// kinds are equivalent to each other. 
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr, + int32_t kind) { + void *ptr = NULL; + + switch (kind) { + case TARGET_ALLOC_DEVICE: + case TARGET_ALLOC_HOST: + case TARGET_ALLOC_SHARED: + case TARGET_ALLOC_DEFAULT: + ptr = malloc(size); + break; + default: + REPORT("Invalid target data allocation kind"); + } + + return ptr; +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + VGPU.awaitAll(); + memcpy(tgt_ptr, hst_ptr, size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + VGPU.awaitAll(); + memcpy(hst_ptr, tgt_ptr, size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { + free(tgt_ptr); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) { + VGPU.await(async_info); + delete (VGPUTy::VGPUStreamTy *)async_info->Queue; + async_info->Queue = nullptr; + return 0; +} + +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t team_num, + int32_t thread_limit, + uint64_t loop_tripcount) { + __tgt_async_info AsyncInfo; + int rc = __tgt_rtl_run_target_team_region_async( + device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, + thread_limit, loop_tripcount, &AsyncInfo); + + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &AsyncInfo); +} + +int32_t __tgt_rtl_run_target_team_region_async( + int32_t device_id, void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, + int32_t thread_limit, uint64_t loop_tripcount /*not used*/, + __tgt_async_info *async_info) { + DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr)); + + auto Call = new FFICallTy(arg_num, tgt_args, tgt_offsets, tgt_entry_ptr); + + 
VGPU.scheduleAsync(async_info, std::move(Call), team_num); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num) { + return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, + tgt_offsets, arg_num, 1, 1, 0); +} + +int32_t __tgt_rtl_run_target_region_async(int32_t device_id, + void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, + __tgt_async_info *async_info) { + return __tgt_rtl_run_target_team_region_async(device_id, tgt_entry_ptr, + tgt_args, tgt_offsets, arg_num, + 1, 1, 0, async_info); +} + +#ifdef __cplusplus +} +#endif Index: openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.h =================================================================== --- /dev/null +++ openmp/libomptarget/plugins/vgpu/src/ThreadEnvironmentImpl.h @@ -0,0 +1,168 @@ +//===---- ThreadEnvironmentImpl.h - Virtual GPU thread environment - C++ --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENTIMPL_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENTIMPL_H
+
+#include "ThreadEnvironment.h"
+#include <atomic>
+#include <barrier>
+#include <cstdio>
+#include <functional>
+#include <map>
+#include <thread>
+#include <vector>
+
+/// State shared by the threads of one virtual warp: a per-lane shuffle buffer
+/// and the barriers the lanes rendezvous on.
+class WarpEnvironmentTy {
+  const unsigned ID;
+  const unsigned NumThreads;
+  std::vector<int32_t> ShuffleBuffer;
+  std::barrier<std::function<void(void)>> Barrier;
+  std::barrier<std::function<void(void)>> ShuffleBarrier;
+  std::barrier<std::function<void(void)>> ShuffleDownBarrier;
+
+public:
+  WarpEnvironmentTy(unsigned ID, unsigned NumThreads)
+      : ID(ID), NumThreads(NumThreads), ShuffleBuffer(NumThreads),
+        Barrier(NumThreads, []() {}), ShuffleBarrier(NumThreads, []() {}),
+        ShuffleDownBarrier(NumThreads, []() {}) {}
+
+  unsigned getWarpId() const { return ID; }
+  int getNumThreads() const { return NumThreads; }
+
+  void sync() { Barrier.arrive_and_wait(); }
+  void writeShuffleBuffer(int32_t Var, unsigned LaneId) {
+    ShuffleBuffer[LaneId] = Var;
+  }
+
+  int32_t getShuffleBuffer(unsigned LaneId) { return ShuffleBuffer[LaneId]; }
+  void waitShuffleBarrier() { ShuffleBarrier.arrive_and_wait(); }
+  void waitShuffleDownBarrier() { ShuffleDownBarrier.arrive_and_wait(); }
+};
+
+/// Barriers and geometry shared by the threads of one virtual CTA.
+class CTAEnvironmentTy {
+public:
+  unsigned ID;
+  unsigned NumThreads;
+  unsigned NumBlocks;
+  std::barrier<std::function<void(void)>> Barrier;
+  std::barrier<std::function<void(void)>> SyncThreads;
+  std::barrier<std::function<void(void)>> NamedBarrier;
+
+  CTAEnvironmentTy(unsigned ID, unsigned NumThreads, unsigned NumBlocks)
+      : ID(ID), NumThreads(NumThreads), NumBlocks(NumBlocks),
+        Barrier(NumThreads, []() {}), SyncThreads(NumThreads, []() {}),
+        NamedBarrier(NumThreads, []() {}) {}
+
+  unsigned getId() const { return ID; }
+  unsigned getNumThreads() const { return NumThreads; }
+  unsigned getNumBlocks() const { return NumBlocks; }
+
+  void fence() { Barrier.arrive_and_wait(); }
+  void syncThreads() { SyncThreads.arrive_and_wait(); }
+  void namedBarrier() { NamedBarrier.arrive_and_wait(); }
+};
+
+/// Id of a thread block (team) and the total number of blocks.
+class ThreadBlockEnvironmentTy {
+  unsigned ID;
+  unsigned NumBlocks;
+
+public:
+  ThreadBlockEnvironmentTy(unsigned ID, unsigned NumBlocks)
+      : ID(ID), NumBlocks(NumBlocks) {}
+
+  unsigned getId() const { return ID; }
+  unsigned getNumBlocks() const { return NumBlocks; }
+};
+
+namespace VGPUImpl {
+/// Per-thread state: the thread's indices plus the warp, CTA and (between
+/// setBlockEnv() and resetBlockEnv()) thread-block environments it uses.
+class ThreadEnvironmentTy {
+  unsigned ThreadIdInWarp;
+  unsigned ThreadIdInBlock;
+  unsigned GlobalThreadIdx;
+  WarpEnvironmentTy *WarpEnvironment;
+  ThreadBlockEnvironmentTy *ThreadBlockEnvironment = nullptr;
+  CTAEnvironmentTy *CTAEnvironment;
+
+public:
+  ThreadEnvironmentTy(unsigned ThreadId, WarpEnvironmentTy *WE,
+                      CTAEnvironmentTy *CTAE)
+      : ThreadIdInWarp(ThreadId),
+        ThreadIdInBlock(WE->getWarpId() * WE->getNumThreads() + ThreadId),
+        GlobalThreadIdx(CTAE->getId() * CTAE->getNumThreads() +
+                        ThreadIdInBlock),
+        WarpEnvironment(WE), CTAEnvironment(CTAE) {}
+
+  void setBlockEnv(ThreadBlockEnvironmentTy *TBE) {
+    ThreadBlockEnvironment = TBE;
+  }
+
+  void resetBlockEnv() {
+    delete ThreadBlockEnvironment;
+    ThreadBlockEnvironment = nullptr;
+  }
+
+  unsigned getThreadIdInWarp() const { return ThreadIdInWarp; }
+  unsigned getThreadIdInBlock() const { return ThreadIdInBlock; }
+  unsigned getGlobalThreadId() const { return GlobalThreadIdx; }
+
+  unsigned getBlockSize() const { return CTAEnvironment->getNumThreads(); }
+  unsigned getBlockId() const { return ThreadBlockEnvironment->getId(); }
+  unsigned getNumberOfBlocks() const {
+    return ThreadBlockEnvironment->getNumBlocks();
+  }
+  // Threads per kernel = block size x blocks. NOTE(review): confirm semantics.
+  unsigned getKernelSize() const {
+    return getBlockSize() * getNumberOfBlocks();
+  }
+
+  // FIXME: This is wrong
+  LaneMaskTy getActiveMask() const { return ~0U; }
+
+  void fenceTeam() { CTAEnvironment->fence(); }
+  void syncWarp() { WarpEnvironment->sync(); }
+
+  /// Return the \p Var contributed by lane \p SrcLane (modulo the warp size).
+  int32_t shuffle(int32_t Var, uint64_t SrcLane) {
+    WarpEnvironment->waitShuffleBarrier();
+    WarpEnvironment->writeShuffleBuffer(Var, ThreadIdInWarp);
+    WarpEnvironment->waitShuffleBarrier();
+    return WarpEnvironment->getShuffleBuffer(SrcLane % getWarpSize());
+  }
+
+  /// Return the \p Var of the lane \p Delta above this one (wrapping around).
+  int32_t shuffleDown(int32_t Var, uint32_t Delta) {
+    WarpEnvironment->waitShuffleDownBarrier();
+    WarpEnvironment->writeShuffleBuffer(Var, ThreadIdInWarp);
+    WarpEnvironment->waitShuffleDownBarrier();
+    return WarpEnvironment->getShuffleBuffer((ThreadIdInWarp + Delta) %
+                                             getWarpSize());
+  }
+
+  void namedBarrier(bool Generic) {
+    if (Generic)
+      CTAEnvironment->namedBarrier();
+    else
+      CTAEnvironment->syncThreads();
+  }
+
+  void fenceKernel(int32_t MemoryOrder) {
+    std::atomic_thread_fence(static_cast<std::memory_order>(MemoryOrder));
+  }
+
+  unsigned getWarpSize() const { return WarpEnvironment->getNumThreads(); }
+};
+} // namespace VGPUImpl
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENTIMPL_H
Index: openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.h
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.h
@@ -0,0 +1,72 @@
+//===---- ThreadEnvironment.h - Virtual GPU thread environment ----- C++ --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENT_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENT_H
+
+#include <cstdint>
+
+using LaneMaskTy = uint64_t;
+
+// Forward declaration
+class WarpEnvironmentTy;
+class ThreadBlockEnvironmentTy;
+class CTAEnvironmentTy;
+namespace VGPUImpl {
+class ThreadEnvironmentTy;
+void setLock(uint32_t *Lock, uint32_t Unset, uint32_t Set, uint32_t OmpSpin,
+             uint32_t BlockId,
+             uint32_t(atomicCAS)(uint32_t *, uint32_t, uint32_t, int));
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
+} // namespace VGPUImpl
+
+/// Facade that forwards every call to a heap-allocated VGPUImpl counterpart.
+class ThreadEnvironmentTy {
+  VGPUImpl::ThreadEnvironmentTy *Impl;
+
+public:
+  ThreadEnvironmentTy(unsigned Id, WarpEnvironmentTy *WE,
+                      CTAEnvironmentTy *CTAE);
+  ~ThreadEnvironmentTy();
+  // Impl is deleted in the destructor, so copies would double-free it.
+  ThreadEnvironmentTy(const ThreadEnvironmentTy &) = delete;
+  ThreadEnvironmentTy &operator=(const ThreadEnvironmentTy &) = delete;
+
+  // Indices of the calling thread.
+  unsigned getThreadIdInWarp() const;
+  unsigned getThreadIdInBlock() const;
+  unsigned getGlobalThreadId() const;
+
+  // Block and kernel geometry.
+  unsigned getBlockSize() const;
+  unsigned getKernelSize() const;
+  unsigned getBlockId() const;
+  unsigned getNumberOfBlocks() const;
+
+  LaneMaskTy getActiveMask() const;
+  unsigned getWarpSize() const;
+
+  // Warp-level data exchange.
+  int32_t shuffle(int32_t Var, uint64_t SrcLane);
+  int32_t shuffleDown(int32_t Var, uint32_t Delta);
+
+  // Fences and barriers.
+  void fenceKernel(int32_t MemoryOrder);
+  void fenceTeam();
+  void syncWarp();
+  void namedBarrier(bool Generic);
+
+  // Attach / delete the thread-block environment used by the block queries.
+  void setBlockEnv(ThreadBlockEnvironmentTy *TBE);
+  void resetBlockEnv();
+};
+
+/// Return the calling thread's environment (held in a thread_local pointer).
+ThreadEnvironmentTy *getThreadEnvironment(void);
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_VGPU_SRC_THREADENVIRONMENT_H
Index: openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.cpp
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins/vgpu/src/ThreadEnvironment.cpp
@@ -0,0 +1,124 @@
+//===---- ThreadEnvironment.cpp - Virtual GPU Thread Environment -- C++ ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of VGPU environment classes.
+//
+//===----------------------------------------------------------------------===//
+//
+#include <cstdint>
+
+#include "ThreadEnvironment.h"
+#include "ThreadEnvironmentImpl.h"
+#include <barrier>
+#include <ctime>
+#include <mutex>
+
+std::mutex AtomicIncLock;
+
+/// Wrapping increment: store 0 if *Address >= Val, otherwise *Address + 1, and
+/// return the previous value. Every call serializes on one global mutex.
+uint32_t VGPUImpl::atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  std::lock_guard G(AtomicIncLock);
+  uint32_t V = *Address;
+  if (V >= Val)
+    *Address = 0;
+  else
+    *Address += 1;
+  return V;
+}
+
+/// Spin until \p atomicCAS swaps \p Lock from \p Unset to \p Set, backing off
+/// for OmpSpin * BlockId clock ticks after every failed attempt.
+void VGPUImpl::setLock(uint32_t *Lock, uint32_t Unset, uint32_t Set,
+                       uint32_t OmpSpin, uint32_t BlockId,
+                       uint32_t(atomicCAS)(uint32_t *, uint32_t, uint32_t,
+                                           int)) {
+  // TODO: not sure spinning is a good idea here..
+  while (atomicCAS((uint32_t *)Lock, Unset, Set, __ATOMIC_SEQ_CST) != Unset) {
+    std::clock_t start = std::clock();
+    std::clock_t now;
+    for (;;) {
+      now = std::clock();
+      std::clock_t cycles =
+          now > start ? now - start : now + (0xffffffff - start);
+      if (cycles >= OmpSpin * BlockId) {
+        break;
+      }
+    }
+  } // wait for 0 to be the read value
+}
+
+extern thread_local ThreadEnvironmentTy *ThreadEnvironment;
+
+ThreadEnvironmentTy *getThreadEnvironment() { return ThreadEnvironment; }
+
+ThreadEnvironmentTy::ThreadEnvironmentTy(unsigned Id, WarpEnvironmentTy *WE,
+                                         CTAEnvironmentTy *CTAE)
+    : Impl(new VGPUImpl::ThreadEnvironmentTy(Id, WE, CTAE)) {}
+
+ThreadEnvironmentTy::~ThreadEnvironmentTy() { delete Impl; }
+
+void ThreadEnvironmentTy::fenceTeam() { Impl->fenceTeam(); }
+
+void ThreadEnvironmentTy::syncWarp() { Impl->syncWarp(); }
+
+unsigned ThreadEnvironmentTy::getThreadIdInWarp() const {
+  return Impl->getThreadIdInWarp();
+}
+
+unsigned ThreadEnvironmentTy::getThreadIdInBlock() const {
+  return Impl->getThreadIdInBlock();
+}
+
+unsigned ThreadEnvironmentTy::getGlobalThreadId() const {
+  return Impl->getGlobalThreadId();
+}
+
+unsigned ThreadEnvironmentTy::getBlockSize() const {
+  return Impl->getBlockSize();
+}
+
+unsigned ThreadEnvironmentTy::getKernelSize() const {
+  return Impl->getKernelSize();
+}
+
+unsigned ThreadEnvironmentTy::getBlockId() const { return Impl->getBlockId(); }
+
+unsigned ThreadEnvironmentTy::getNumberOfBlocks() const {
+  return Impl->getNumberOfBlocks();
+}
+
+LaneMaskTy ThreadEnvironmentTy::getActiveMask() const {
+  return Impl->getActiveMask();
+}
+
+int32_t ThreadEnvironmentTy::shuffle(int32_t Var, uint64_t SrcLane) {
+  return Impl->shuffle(Var, SrcLane);
+}
+
+int32_t ThreadEnvironmentTy::shuffleDown(int32_t Var, uint32_t Delta) {
+  return Impl->shuffleDown(Var, Delta);
+}
+
+void ThreadEnvironmentTy::fenceKernel(int32_t MemoryOrder) {
+  Impl->fenceKernel(MemoryOrder);
+}
+
+void ThreadEnvironmentTy::namedBarrier(bool Generic) {
+  Impl->namedBarrier(Generic);
+}
+
+void ThreadEnvironmentTy::setBlockEnv(ThreadBlockEnvironmentTy *TBE) {
+  Impl->setBlockEnv(TBE);
+}
+
+void ThreadEnvironmentTy::resetBlockEnv() { Impl->resetBlockEnv(); }
+ +unsigned ThreadEnvironmentTy::getWarpSize() const { + return Impl->getWarpSize(); +} Index: openmp/libomptarget/plugins/vgpu/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins/vgpu/CMakeLists.txt @@ -0,0 +1,59 @@ +set(tmachine_name "vgpu") +set(tmachine_libname "vgpu") +set(tmachine_triple "x86_64-vgpu") +set(elf_machine_id "62") + +if(LIBOMPTARGET_DEP_LIBELF_FOUND) + if(LIBOMPTARGET_DEP_LIBFFI_FOUND) + + libomptarget_say("Building ${tmachine_name} offloading plugin.") + + include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) + include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) + include_directories(${LIBOMPTARGET_INCLUDE_DIR}) + + # Define macro to be used as prefix of the runtime messages for this target. + add_definitions("-DTARGET_NAME=${tmachine_name}") + + # Define macro with the ELF ID for this target. + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_library("omptarget.rtl.${tmachine_libname}" SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/ThreadEnvironment.cpp) + + # Install plugin under the lib destination folder. + install(TARGETS "omptarget.rtl.${tmachine_libname}" + LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") + + set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES CXX_STANDARD 20) + target_compile_options("omptarget.rtl.${tmachine_libname}" PRIVATE "-stdlib=libc++") + + target_link_libraries( + "omptarget.rtl.${tmachine_libname}" + elf_common + ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + dl + ${OPENMP_PTHREAD_LIB} + "-rdynamic" + c++ + -g + #"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" + ) + + list(APPEND LIBOMPTARGET_TESTED_PLUGINS + "omptarget.rtl.${tmachine_libname}") + + # Report to the parent scope that we are building a plugin. 
+ set(LIBOMPTARGET_SYSTEM_TARGETS + "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE) + set(LIBOMPTARGET_TESTED_PLUGINS + "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) + + else(LIBOMPTARGET_DEP_LIBFFI_FOUND) + libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.") + endif(LIBOMPTARGET_DEP_LIBFFI_FOUND) +else(LIBOMPTARGET_DEP_LIBELF_FOUND) + libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.") +endif(LIBOMPTARGET_DEP_LIBELF_FOUND) Index: openmp/libomptarget/plugins/CMakeLists.txt =================================================================== --- openmp/libomptarget/plugins/CMakeLists.txt +++ openmp/libomptarget/plugins/CMakeLists.txt @@ -75,6 +75,7 @@ add_subdirectory(ppc64) add_subdirectory(ppc64le) add_subdirectory(ve) +add_subdirectory(vgpu) add_subdirectory(x86_64) add_subdirectory(remote) Index: openmp/libomptarget/DeviceRTL/src/Utils.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/Utils.cpp +++ openmp/libomptarget/DeviceRTL/src/Utils.cpp @@ -49,6 +49,24 @@ #pragma omp end declare variant +/// Virtual GPU Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {kind(cpu)}, implementation = {extension(match_any)}) + +void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) { + *LowBits = (uint32_t)(Val & static_cast<uint64_t>(0x00000000FFFFFFFF)); + *HighBits = + (uint32_t)((Val & static_cast<uint64_t>(0xFFFFFFFF00000000)) >> 32); +} + +uint64_t Pack(uint32_t LowBits, uint32_t HighBits) { + return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits; +} + +#pragma omp end declare variant + /// NVPTX Implementation /// ///{ @@ -113,6 +131,26 @@ #pragma omp end declare variant } // namespace impl +/// Virtual GPU Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {kind(cpu)}, implementation = {extension(match_any)}) + +#include 
"ThreadEnvironment.h" +namespace impl { + +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { + return getThreadEnvironment()->shuffle(Var, SrcLane); +} + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) { + return getThreadEnvironment()->shuffleDown(Var, Delta); +} + +} // namespace impl +#pragma omp end declare variant + uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) { return impl::Pack(LowBits, HighBits); } Index: openmp/libomptarget/DeviceRTL/src/Synchronization.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/Synchronization.cpp +++ openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -283,6 +283,73 @@ } // namespace impl +/// Virtual GPU Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {kind(cpu)}, implementation = {extension(match_any)}) + +#include "ThreadEnvironment.h" +namespace impl { + +uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) { + return VGPUImpl::atomicInc(Address, Val, Ordering); +} + +void namedBarrierInit() {} + +void namedBarrier() { + uint32_t NumThreads = omp_get_num_threads(); + ASSERT(NumThreads % mapping::getWarpSize() == 0); + getThreadEnvironment()->namedBarrier(true); +} + +void fenceTeam(int) { getThreadEnvironment()->fenceTeam(); } + +void fenceKernel(int memory_order) { + getThreadEnvironment()->fenceKernel(memory_order); +} + +// Simply call fenceKernel because there is no need to sync with host +void fenceSystem(int) { fenceKernel(0); } + +void syncWarp(__kmpc_impl_lanemask_t Mask) { + getThreadEnvironment()->syncWarp(); +} + +void syncThreads() { getThreadEnvironment()->namedBarrier(false); } + +constexpr uint32_t OMP_SPIN = 1000; +constexpr uint32_t UNSET = 0; +constexpr uint32_t SET = 1; + +// TODO: This seems to hide a bug in the declare variant handling. If it is +// called before it is defined +// here the overload won't happen. Investigate lalter! 
+void unsetLock(omp_lock_t *Lock) { + (void)atomicExchange((uint32_t *)Lock, UNSET, __ATOMIC_SEQ_CST); +} + +int testLock(omp_lock_t *Lock) { + return atomicAdd((uint32_t *)Lock, 0u, __ATOMIC_SEQ_CST); +} + +void initLock(omp_lock_t *Lock) { unsetLock(Lock); } + +void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); } + +void setLock(omp_lock_t *Lock) { + VGPUImpl::setLock((uint32_t *)Lock, UNSET, SET, OMP_SPIN, + mapping::getBlockId(), atomicCAS); +} + +void syncThreadsAligned() {} + +} // namespace impl + +#pragma omp end declare variant +///} + void synchronize::init(bool IsSPMD) { if (!IsSPMD) impl::namedBarrierInit(); Index: openmp/libomptarget/DeviceRTL/src/Misc.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/Misc.cpp +++ openmp/libomptarget/DeviceRTL/src/Misc.cpp @@ -18,10 +18,9 @@ namespace _OMP { namespace impl { -/// AMDGCN Implementation +/// Generic Implementation - AMDGCN, VGPU /// ///{ -#pragma omp begin declare variant match(device = {arch(amdgcn)}) double getWTick() { return ((double)1E-9); } @@ -33,8 +32,6 @@ return 0; } -#pragma omp end declare variant - /// NVPTX Implementation /// ///{ Index: openmp/libomptarget/DeviceRTL/src/Mapping.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/Mapping.cpp +++ openmp/libomptarget/DeviceRTL/src/Mapping.cpp @@ -21,6 +21,83 @@ using namespace _OMP; +/// Virtual GPU Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {kind(cpu)}, implementation = {extension(match_any)}) + +#include "ThreadEnvironment.h" + +namespace _OMP { +namespace impl { + +constexpr const llvm::omp::GV &getGridValue() { + return llvm::omp::VirtualGpuGridValues; +} + +LaneMaskTy activemask() { + uint64_t B = 0; + uint32_t N = mapping::getWarpSize(); + while (N) + B |= (1 << (--N)); + return B; +} + +LaneMaskTy lanemaskLT() { + const uint32_t Lane = mapping::getThreadIdInWarp(); + LaneMaskTy 
Ballot = mapping::activemask(); + LaneMaskTy Mask = ((LaneMaskTy)1 << Lane) - (LaneMaskTy)1; + return Mask & Ballot; +} + +LaneMaskTy lanemaskGT() { + const uint32_t Lane = mapping::getThreadIdInWarp(); + if (Lane == (mapping::getWarpSize() - 1)) + return 0; + LaneMaskTy Ballot = mapping::activemask(); + LaneMaskTy Mask = (~((LaneMaskTy)0)) << (Lane + 1); + return Mask & Ballot; +} + +uint32_t getThreadIdInWarp() { + return mapping::getThreadIdInBlock() & (mapping::getWarpSize() - 1); +} + +uint32_t getThreadIdInBlock() { + return getThreadEnvironment()->getThreadIdInBlock(); +} + +uint32_t getNumHardwareThreadsInBlock() { + return getThreadEnvironment()->getBlockSize(); +} + +uint32_t getKernelSize() { return getThreadEnvironment()->getKernelSize(); } + +uint32_t getBlockId() { return getThreadEnvironment()->getBlockId(); } + +uint32_t getNumberOfBlocks() { + return getThreadEnvironment()->getNumberOfBlocks(); +} + +uint32_t getNumberOfProcessorElements() { return mapping::getBlockSize(); } + +uint32_t getWarpId() { + return mapping::getThreadIdInBlock() / mapping::getWarpSize(); +} + +uint32_t getWarpSize() { return getThreadEnvironment()->getWarpSize(); } + +uint32_t getNumberOfWarpsInBlock() { + return (mapping::getBlockSize() + mapping::getWarpSize() - 1) / + mapping::getWarpSize(); +} + +} // namespace impl +} // namespace _OMP + +#pragma omp end declare variant + namespace _OMP { namespace impl { Index: openmp/libomptarget/DeviceRTL/src/Kernel.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/Kernel.cpp +++ openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -124,6 +124,22 @@ state::ParallelRegionFn = nullptr; } +#pragma omp begin declare variant match( \ + device = {kind(cpu)}, implementation = {extension(match_any)}) +void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode, bool) { + FunctionTracingRAII(); + const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD; + state::assumeInitialState(IsSPMD); + if 
(IsSPMD) + return; + + // Signal the workers to exit the state machine and exit the kernel. + state::ParallelRegionFn = nullptr; + + synchronize::threads(); +} +#pragma omp end declare variant + int8_t __kmpc_is_spmd_exec_mode() { FunctionTracingRAII(); return mapping::isSPMDMode(); Index: openmp/libomptarget/DeviceRTL/src/Debug.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/Debug.cpp +++ openmp/libomptarget/DeviceRTL/src/Debug.cpp @@ -49,6 +49,16 @@ } // namespace impl #pragma omp end declare variant +#pragma omp begin declare variant match( \ + device = {kind(cpu)}, implementation = {extension(match_any)}) +int32_t vprintf(const char *, void *); +namespace impl { +static int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) { + return vprintf(Format, Arguments); +} +} // namespace impl +#pragma omp end declare variant + int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) { return impl::omp_vprintf(Format, Arguments, Size); } Index: openmp/libomptarget/DeviceRTL/include/ThreadEnvironment.h =================================================================== --- /dev/null +++ openmp/libomptarget/DeviceRTL/include/ThreadEnvironment.h @@ -0,0 +1,11 @@ +//===--- ThreadEnvironment.h - OpenMP VGPU Dummy Header File ------ C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Dummy header file to avoid preprocessor errors in device runtime. 
+// +//===----------------------------------------------------------------------===// Index: openmp/libomptarget/DeviceRTL/CMakeLists.txt =================================================================== --- openmp/libomptarget/DeviceRTL/CMakeLists.txt +++ openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -152,9 +152,8 @@ add_custom_command(OUTPUT ${outfile} COMMAND ${CLANG_TOOL} - ${bc_flags} - -Xclang -target-cpu -Xclang ${target_cpu} ${target_bc_flags} + ${bc_flags} ${infile} -o ${outfile} DEPENDS ${infile} IMPLICIT_DEPENDS CXX ${infile} @@ -222,9 +221,11 @@ # Generate a Bitcode library for all the compute capabilities the user requested foreach(sm ${nvptx_sm_list}) - compileDeviceRTLLibrary(sm_${sm} nvptx -target nvptx64 -Xclang -target-feature -Xclang +ptx61 "-D__CUDA_ARCH__=${sm}0") + compileDeviceRTLLibrary(sm_${sm} nvptx -Xclang -target-cpu -Xclang sm_${sm} -target nvptx64 -Xclang -target-feature -Xclang +ptx61 "-D__CUDA_ARCH__=${sm}0") endforeach() foreach(mcpu ${amdgpu_mcpus}) - compileDeviceRTLLibrary(${mcpu} amdgpu -target amdgcn-amd-amdhsa -D__AMDGCN__ -fvisibility=default -nogpulib) + compileDeviceRTLLibrary(${mcpu} amdgpu -Xclang -target-cpu -Xclang ${mcpu} -target amdgcn-amd-amdhsa -D__AMDGCN__ -fvisibility=default -nogpulib) endforeach() + +compileDeviceRTLLibrary(x86_64 vgpu -target x86_64-vgpu -std=c++20 -stdlib=libc++ -I${devicertl_base_directory}/../plugins/vgpu/src) Index: openmp/CMakeLists.txt =================================================================== --- openmp/CMakeLists.txt +++ openmp/CMakeLists.txt @@ -39,6 +39,8 @@ set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang.exe) set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++.exe) endif() + + list(APPEND LIBOMPTARGET_LLVM_INCLUDE_DIRS ${LLVM_MAIN_INCLUDE_DIR} ${LLVM_BINARY_DIR}/include) endif() # Check and set up common compiler flags. 
Index: llvm/lib/Support/Triple.cpp =================================================================== --- llvm/lib/Support/Triple.cpp +++ llvm/lib/Support/Triple.cpp @@ -185,6 +185,8 @@ case PC: return "pc"; case SCEI: return "scei"; case SUSE: return "suse"; + case OpenMP_VGPU: + return "openmp_vgpu"; } llvm_unreachable("Invalid VendorType!"); @@ -492,22 +494,23 @@ static Triple::VendorType parseVendor(StringRef VendorName) { return StringSwitch<Triple::VendorType>(VendorName) - .Case("apple", Triple::Apple) - .Case("pc", Triple::PC) - .Case("scei", Triple::SCEI) - .Case("sie", Triple::SCEI) - .Case("fsl", Triple::Freescale) - .Case("ibm", Triple::IBM) - .Case("img", Triple::ImaginationTechnologies) - .Case("mti", Triple::MipsTechnologies) - .Case("nvidia", Triple::NVIDIA) - .Case("csr", Triple::CSR) - .Case("myriad", Triple::Myriad) - .Case("amd", Triple::AMD) - .Case("mesa", Triple::Mesa) - .Case("suse", Triple::SUSE) - .Case("oe", Triple::OpenEmbedded) - .Default(Triple::UnknownVendor); + .Case("apple", Triple::Apple) + .Case("pc", Triple::PC) + .Case("scei", Triple::SCEI) + .Case("sie", Triple::SCEI) + .Case("fsl", Triple::Freescale) + .Case("ibm", Triple::IBM) + .Case("img", Triple::ImaginationTechnologies) + .Case("mti", Triple::MipsTechnologies) + .Case("nvidia", Triple::NVIDIA) + .Case("csr", Triple::CSR) + .Case("myriad", Triple::Myriad) + .Case("amd", Triple::AMD) + .Case("mesa", Triple::Mesa) + .Case("suse", Triple::SUSE) + .Case("oe", Triple::OpenEmbedded) + .Case("vgpu", Triple::OpenMP_VGPU) + .Default(Triple::UnknownVendor); } static Triple::OSType parseOS(StringRef OSName) { Index: llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h =================================================================== --- llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h +++ llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h @@ -114,6 +114,38 @@ 128, // GV_Default_WG_Size }; +/// For Virtual GPUs +static constexpr GV VirtualGpuGridValues = { + 256, // GV_Slot_Size + 32, 
// GV_Warp_Size + 1024, // GV_Max_Teams + 896, // GV_SimpleBufferSize + 1024, // GV_Max_WG_Size + 128, // GV_Default_WG_Size +}; + +static const unsigned OpenMPVGPUAddrSpaceMap[] = { + 0, // Default + 1, // opencl_global + 3, // opencl_local + 4, // opencl_constant + 0, // opencl_private + 0, // opencl_generic + 1, // opencl_global_device + 1, // opencl_global_host + 1, // cuda_device + 4, // cuda_constant + 3, // cuda_shared + 1, // sycl_global + 0, // sycl_global_device + 0, // sycl_global_host + 3, // sycl_local + 0, // sycl_private + 270, // ptr32_sptr + 271, // ptr32_uptr + 272 // ptr64 +}; + +} // namespace omp } // namespace llvm Index: llvm/include/llvm/ADT/Triple.h =================================================================== --- llvm/include/llvm/ADT/Triple.h +++ llvm/include/llvm/ADT/Triple.h @@ -164,7 +164,8 @@ Mesa, SUSE, OpenEmbedded, - LastVendorType = OpenEmbedded + OpenMP_VGPU, + LastVendorType = OpenMP_VGPU }; enum OSType { UnknownOS, @@ -692,6 +693,11 @@ return getArch() == Triple::nvptx || getArch() == Triple::nvptx64; } + /// Tests whether the target is OpenMP VGPU. 
+ bool isOpenMPVGPU() const { + return getVendor() == llvm::Triple::OpenMP_VGPU; + } + /// Tests whether the target is AMDGCN bool isAMDGCN() const { return getArch() == Triple::amdgcn; } Index: clang/lib/Frontend/CompilerInvocation.cpp =================================================================== --- clang/lib/Frontend/CompilerInvocation.cpp +++ clang/lib/Frontend/CompilerInvocation.cpp @@ -3979,7 +3979,8 @@ } // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options - Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) && + Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && + (T.isNVPTX() || T.isAMDGCN() || T.isOpenMPVGPU()) && Args.hasArg(options::OPT_fopenmp_cuda_mode); // Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options Index: clang/lib/Driver/ToolChains/Gnu.cpp =================================================================== --- clang/lib/Driver/ToolChains/Gnu.cpp +++ clang/lib/Driver/ToolChains/Gnu.cpp @@ -3067,4 +3067,13 @@ if (!DriverArgs.hasFlag(options::OPT_fuse_init_array, options::OPT_fno_use_init_array, true)) CC1Args.push_back("-fno-use-init-array"); + + if (DriverArgs.hasArg(options::OPT_S)) + return; + + if (getTriple().getVendor() == llvm::Triple::OpenMP_VGPU) { + std::string BitcodeSuffix = "x86_64-openmp_vgpu"; + clang::driver::tools::addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, + BitcodeSuffix, getTriple()); + } } Index: clang/lib/CodeGen/CodeGenModule.cpp =================================================================== --- clang/lib/CodeGen/CodeGenModule.cpp +++ clang/lib/CodeGen/CodeGenModule.cpp @@ -249,7 +249,9 @@ OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this)); break; default: - if (LangOpts.OpenMPSimd) + if (getTriple().getVendor() == llvm::Triple::OpenMP_VGPU) { + OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this)); + } else if (LangOpts.OpenMPSimd) OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this)); else OpenMPRuntime.reset(new CGOpenMPRuntime(*this)); Index: 
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp =================================================================== --- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1119,10 +1119,11 @@ CGM.addCompilerUsedGlobal(GVMode); } -void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID, - llvm::Constant *Addr, - uint64_t Size, int32_t, - llvm::GlobalValue::LinkageTypes) { +void CGOpenMPRuntimeGPU::createOffloadEntry( + llvm::Constant *ID, llvm::Constant *Addr, uint64_t Size, int32_t Flags, + llvm::GlobalValue::LinkageTypes Linkage) { + if (CGM.getTarget().getTriple().getVendor() == llvm::Triple::OpenMP_VGPU) + return CGOpenMPRuntime::createOffloadEntry(ID, Addr, Size, Flags, Linkage); // TODO: Add support for global variables on the device after declare target // support. if (!isa<llvm::Function>(Addr)) Index: clang/lib/Basic/Targets/X86.h =================================================================== --- clang/lib/Basic/Targets/X86.h +++ clang/lib/Basic/Targets/X86.h @@ -17,6 +17,7 @@ #include "clang/Basic/TargetInfo.h" #include "clang/Basic/TargetOptions.h" #include "llvm/ADT/Triple.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/X86TargetParser.h" @@ -388,6 +389,10 @@ uint64_t getPointerAlignV(unsigned AddrSpace) const override { return getPointerWidthV(AddrSpace); } + + const llvm::omp::GV &getGridValue() const override { + return llvm::omp::VirtualGpuGridValues; + } }; // X86-32 generic target Index: clang/lib/Basic/TargetInfo.cpp =================================================================== --- clang/lib/Basic/TargetInfo.cpp +++ clang/lib/Basic/TargetInfo.cpp @@ -150,6 +150,9 @@ PlatformMinVersion = VersionTuple(); MaxOpenCLWorkGroupSize = 1024; + + if (Triple.getVendor() == llvm::Triple::OpenMP_VGPU) + AddrSpaceMap = &llvm::omp::OpenMPVGPUAddrSpaceMap; } // Out of line virtual dtor for TargetInfo.
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits