llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-pgo Author: Ethan Luis McDonough (EthanLuisMcDonough) <details> <summary>Changes</summary> This pull request is the first part of an ongoing effort to extend PGO instrumentation to GPU device code. This PR makes the following changes: - Adds blank registration functions to device RTL - Gives PGO globals protected visibility when targeting a supported GPU - Handles any addrspace casts for PGO calls - Implements PGO global extraction in GPU plugins (currently only dumps info) These changes can be tested by supplying `-fprofile-instrument=clang` while targeting a GPU. --- Full diff: https://github.com/llvm/llvm-project/pull/76587.diff 11 Files Affected: - (modified) clang/lib/CodeGen/CodeGenPGO.cpp (+8-2) - (modified) llvm/include/llvm/Frontend/OpenMP/OMPKinds.def (+3) - (modified) llvm/include/llvm/ProfileData/InstrProf.h (+4) - (modified) llvm/lib/ProfileData/InstrProf.cpp (+15-2) - (modified) llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp (+26-7) - (modified) openmp/libomptarget/DeviceRTL/CMakeLists.txt (+2) - (added) openmp/libomptarget/DeviceRTL/include/Profiling.h (+21) - (added) openmp/libomptarget/DeviceRTL/src/Profiling.cpp (+19) - (modified) openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h (+27) - (modified) openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp (+82) - (modified) openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp (+14) ``````````diff diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 81bf8ea696b164..edae6885b528ac 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -959,8 +959,14 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S, unsigned Counter = (*RegionCounterMap)[S]; - llvm::Value *Args[] = {FuncNameVar, - Builder.getInt64(FunctionHash), + // Make sure that pointer to global is passed in with zero 
addrspace + // This is relevant during GPU profiling + auto *I8Ty = llvm::Type::getInt8Ty(CGM.getLLVMContext()); + auto *I8PtrTy = llvm::PointerType::getUnqual(I8Ty); + auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( + FuncNameVar, I8PtrTy); + + llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash), Builder.getInt32(NumRegionCounters), Builder.getInt32(Counter), StepV}; if (!StepV) diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index d22d2a8e948b00..1d887d5cb58127 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -503,6 +503,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32) __OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,) __OMP_RTL(__kmpc_syncwarp, false, Void, Int64) +__OMP_RTL(__llvm_profile_register_function, false, Void, VoidPtr) +__OMP_RTL(__llvm_profile_register_names_function, false, Void, VoidPtr, Int64) + __OMP_RTL(__last, false, Void, ) #undef __OMP_RTL diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index 36be2e7d869e7b..32648e4a67ad9e 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -171,6 +171,10 @@ inline StringRef getInstrProfCounterBiasVarName() { /// Return the marker used to separate PGO names during serialization. inline StringRef getInstrProfNameSeparator() { return "\01"; } +/// Determines whether module targets a GPU eligable for PGO +/// instrumentation +bool isGPUProfTarget(const Module &M); + /// Please use getIRPGOFuncName for LLVM IR instrumentation. This function is /// for front-end (Clang, etc) instrumentation. 
/// Return the modified name for function \c F suitable to be diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 134a400e639c4b..cdcd6840bb5108 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -428,13 +428,22 @@ std::string getPGOFuncNameVarName(StringRef FuncName, return VarName; } +bool isGPUProfTarget(const Module &M) { + const auto &triple = M.getTargetTriple(); + return triple.rfind("nvptx", 0) == 0 || triple.rfind("amdgcn", 0) == 0 || + triple.rfind("r600", 0) == 0; +} + GlobalVariable *createPGOFuncNameVar(Module &M, GlobalValue::LinkageTypes Linkage, StringRef PGOFuncName) { + // Ensure profiling variables on GPU are visible to be read from host + if (isGPUProfTarget(M)) + Linkage = GlobalValue::ExternalLinkage; // We generally want to match the function's linkage, but available_externally // and extern_weak both have the wrong semantics, and anything that doesn't // need to link across compilation units doesn't need to be visible at all. - if (Linkage == GlobalValue::ExternalWeakLinkage) + else if (Linkage == GlobalValue::ExternalWeakLinkage) Linkage = GlobalValue::LinkOnceAnyLinkage; else if (Linkage == GlobalValue::AvailableExternallyLinkage) Linkage = GlobalValue::LinkOnceODRLinkage; @@ -448,8 +457,12 @@ GlobalVariable *createPGOFuncNameVar(Module &M, new GlobalVariable(M, Value->getType(), true, Linkage, Value, getPGOFuncNameVarName(PGOFuncName, Linkage)); + // If the target is a GPU, make the symbol protected so it can + // be read from the host device + if (isGPUProfTarget(M)) + FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility); // Hide the symbol so that we correctly get a copy for each executable. 
- if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage())) + else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage())) FuncNameVar->setVisibility(GlobalValue::HiddenVisibility); return FuncNameVar; diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index fe5a0578bd9721..61fba7be3ee0ee 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1481,6 +1481,10 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]); + if (isGPUProfTarget(M)) { + Linkage = GlobalValue::ExternalLinkage; + Visibility = GlobalValue::ProtectedVisibility; + } // If the data variable is not referenced by code (if we don't emit // @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the // data variable live under linker GC, the data variable can be private. This @@ -1492,9 +1496,9 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { // If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees // that other copies must have the same CFG and cannot have value profiling. // If no hash suffix, other profd copies may be referenced by code. 
- if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) && - (TT.isOSBinFormatELF() || - (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) { + else if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) && + (TT.isOSBinFormatELF() || + (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) { Linkage = GlobalValue::PrivateLinkage; Visibility = GlobalValue::DefaultVisibility; } @@ -1617,6 +1621,13 @@ void InstrLowerer::emitNameData() { NamesVar = new GlobalVariable(M, NamesVal->getType(), true, GlobalValue::PrivateLinkage, NamesVal, getInstrProfNamesVarName()); + + // Make names variable public if current target is a GPU + if (isGPUProfTarget(M)) { + NamesVar->setLinkage(GlobalValue::ExternalLinkage); + NamesVar->setVisibility(GlobalValue::VisibilityTypes::ProtectedVisibility); + } + NamesSize = CompressedNameStr.size(); setGlobalVariableLargeSection(TT, *NamesVar); NamesVar->setSection( @@ -1658,10 +1669,13 @@ void InstrLowerer::emitRegistration() { IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", RegisterF)); for (Value *Data : CompilerUsedVars) if (!isa<Function>(Data)) - IRB.CreateCall(RuntimeRegisterF, Data); + // Check for addrspace cast when profiling GPU + IRB.CreateCall(RuntimeRegisterF, + IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy)); for (Value *Data : UsedVars) if (Data != NamesVar && !isa<Function>(Data)) - IRB.CreateCall(RuntimeRegisterF, Data); + IRB.CreateCall(RuntimeRegisterF, + IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy)); if (NamesVar) { Type *ParamTypes[] = {VoidPtrTy, Int64Ty}; @@ -1670,7 +1684,9 @@ void InstrLowerer::emitRegistration() { auto *NamesRegisterF = Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage, getInstrProfNamesRegFuncName(), M); - IRB.CreateCall(NamesRegisterF, {NamesVar, IRB.getInt64(NamesSize)}); + IRB.CreateCall(NamesRegisterF, {IRB.CreatePointerBitCastOrAddrSpaceCast( + NamesVar, VoidPtrTy), + IRB.getInt64(NamesSize)}); } IRB.CreateRetVoid(); @@ -1691,7 
+1707,10 @@ bool InstrLowerer::emitRuntimeHook() { auto *Var = new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage, nullptr, getInstrProfRuntimeHookVarName()); - Var->setVisibility(GlobalValue::HiddenVisibility); + if (isGPUProfTarget(M)) + Var->setVisibility(GlobalValue::ProtectedVisibility); + else + Var->setVisibility(GlobalValue::HiddenVisibility); if (TT.isOSBinFormatELF() && !TT.isPS()) { // Mark the user variable as used so that it isn't stripped out. diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt index 1ce3e1e40a80ab..55ee15d068c67b 100644 --- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt +++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -89,6 +89,7 @@ set(include_files ${include_directory}/Interface.h ${include_directory}/LibC.h ${include_directory}/Mapping.h + ${include_directory}/Profiling.h ${include_directory}/State.h ${include_directory}/Synchronization.h ${include_directory}/Types.h @@ -104,6 +105,7 @@ set(src_files ${source_directory}/Mapping.cpp ${source_directory}/Misc.cpp ${source_directory}/Parallelism.cpp + ${source_directory}/Profiling.cpp ${source_directory}/Reduction.cpp ${source_directory}/State.cpp ${source_directory}/Synchronization.cpp diff --git a/openmp/libomptarget/DeviceRTL/include/Profiling.h b/openmp/libomptarget/DeviceRTL/include/Profiling.h new file mode 100644 index 00000000000000..68c7744cd60752 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Profiling.h @@ -0,0 +1,21 @@ +//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_PROFILING_H +#define OMPTARGET_DEVICERTL_PROFILING_H + +extern "C" { + +void __llvm_profile_register_function(void *ptr); +void __llvm_profile_register_names_function(void *ptr, long int i); +} + +#endif diff --git a/openmp/libomptarget/DeviceRTL/src/Profiling.cpp b/openmp/libomptarget/DeviceRTL/src/Profiling.cpp new file mode 100644 index 00000000000000..799477f5e47d27 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Profiling.cpp @@ -0,0 +1,19 @@ +//===------- Profiling.cpp ---------------------------------------- C++ ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Profiling.h" + +#pragma omp begin declare target device_type(nohost) + +extern "C" { + +void __llvm_profile_register_function(void *ptr) {} +void __llvm_profile_register_names_function(void *ptr, long int i) {} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h index d9fe938790ca76..a803b3f76d8b25 100644 --- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h +++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h @@ -14,9 +14,11 @@ #define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H #include <string> +#include <vector> #include "llvm/ADT/DenseMap.h" #include "llvm/Object/ELFObjectFile.h" +#include "llvm/ProfileData/InstrProf.h" #include "Shared/Debug.h" #include "Shared/Utils.h" @@ 
-58,6 +60,22 @@ class GlobalTy { void setPtr(void *P) { Ptr = P; } }; +typedef void *IntPtrT; +struct __llvm_profile_data { +#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) Type Name; +#include "llvm/ProfileData/InstrProfData.inc" +}; + +/// PGO profiling data extracted from a GPU device +struct GPUProfGlobals { + std::string names; + std::vector<std::vector<int64_t>> counts; + std::vector<__llvm_profile_data> data; + Triple targetTriple; + + void dump() const; +}; + /// Subclass of GlobalTy that holds the memory for a global of \p Ty. template <typename Ty> class StaticGlobalTy : public GlobalTy { Ty Data; @@ -167,6 +185,15 @@ class GenericGlobalHandlerTy { return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, /* D2H */ false); } + + /// Checks whether a given image contains profiling globals. + bool hasProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image); + + /// Reads profiling data from a GPU image to supplied profdata struct. + /// Iterates through the image symbol table and stores global values + /// with profiling prefixes. 
+ Expected<GPUProfGlobals> readProfilingGlobals(GenericDeviceTy &Device, + DeviceImageTy &Image); }; } // namespace plugin diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp index d398f60c55bd13..cb71b61f4a9c4f 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp @@ -163,3 +163,85 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device, return Plugin::success(); } + +bool GenericGlobalHandlerTy::hasProfilingGlobals(GenericDeviceTy &Device, + DeviceImageTy &Image) { + GlobalTy global(getInstrProfNamesVarName().str(), 0); + if (auto Err = getGlobalMetadataFromImage(Device, Image, global)) { + consumeError(std::move(Err)); + return false; + } + return true; +} + +Expected<GPUProfGlobals> +GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, + DeviceImageTy &Image) { + GPUProfGlobals profdata; + const auto *elf = getOrCreateELFObjectFile(Device, Image); + profdata.targetTriple = elf->makeTriple(); + // Iterate through + for (auto &sym : elf->symbols()) { + if (auto name = sym.getName()) { + // Check if given current global is a profiling global based + // on name + if (name->equals(getInstrProfNamesVarName())) { + // Read in profiled function names + std::vector<char> chars(sym.getSize() / sizeof(char), ' '); + GlobalTy NamesGlobal(name->str(), sym.getSize(), chars.data()); + if (auto Err = readGlobalFromDevice(Device, Image, NamesGlobal)) + return Err; + std::string names(chars.begin(), chars.end()); + profdata.names = std::move(names); + } else if (name->starts_with(getInstrProfCountersVarPrefix())) { + // Read global variable profiling counts + std::vector<int64_t> counts(sym.getSize() / sizeof(int64_t), 0); + GlobalTy CountGlobal(name->str(), sym.getSize(), counts.data()); + if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal)) + 
return Err; + profdata.counts.push_back(std::move(counts)); + } else if (name->starts_with(getInstrProfDataVarPrefix())) { + // Read profiling data for this global variable + __llvm_profile_data data{}; + GlobalTy DataGlobal(name->str(), sym.getSize(), &data); + if (auto Err = readGlobalFromDevice(Device, Image, DataGlobal)) + return Err; + profdata.data.push_back(std::move(data)); + } + } + } + return profdata; +} + +void GPUProfGlobals::dump() const { + llvm::outs() << "======= GPU Profile =======\nTarget: " << targetTriple.str() + << "\n"; + + llvm::outs() << "======== Counters =========\n"; + for (const auto &count : counts) { + llvm::outs() << "["; + for (size_t i = 0; i < count.size(); i++) { + if (i == 0) + llvm::outs() << " "; + llvm::outs() << count[i] << " "; + } + llvm::outs() << "]\n"; + } + + llvm::outs() << "========== Data ===========\n"; + for (const auto &d : data) { + llvm::outs() << "{ "; +#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) \ + llvm::outs() << d.Name << " "; +#include "llvm/ProfileData/InstrProfData.inc" + llvm::outs() << " }\n"; + } + + llvm::outs() << "======== Functions ========\n"; + InstrProfSymtab symtab; + if (Error Err = symtab.create(StringRef(names))) { + consumeError(std::move(Err)); + } + symtab.dumpNames(llvm::outs()); + llvm::outs() << "===========================\n"; +} diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp index 178c60a77ab51f..3d218570a49445 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp @@ -817,6 +817,20 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { DeviceMemoryPoolTracking.AllocationMax); } + for (auto *Image : LoadedImages) { + GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); + if (!Handler.hasProfilingGlobals(*this, *Image)) + continue; + + GPUProfGlobals 
profdata; + auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image); + if (!ProfOrErr) + return ProfOrErr.takeError(); + + // TODO: write data to profiling file + ProfOrErr->dump(); + } + // Delete the memory manager before deinitializing the device. Otherwise, // we may delete device allocations after the device is deinitialized. if (MemoryManager) `````````` </details> https://github.com/llvm/llvm-project/pull/76587 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits