https://github.com/fel-cab updated https://github.com/llvm/llvm-project/pull/68016
>From dd44de067c26ba94b6561c5ed7fa4a5d812a3d1a Mon Sep 17 00:00:00 2001 From: Felipe Cabarcas <cabar...@leia.crpl.cis.udel.edu> Date: Mon, 18 Sep 2023 12:07:12 +0000 Subject: [PATCH 1/9] testing Profiler features --- openmp/libomptarget/src/interface.cpp | 5 ++++- openmp/libomptarget/src/private.h | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index 5f21b16b3fbfb1e..f64e1e268a3952e 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -252,7 +252,10 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>, "Target AsyncInfoTy must be convertible to AsyncInfoTy."); - TIMESCOPE_WITH_IDENT(Loc); + //TIMESCOPE_WITH_IDENT(Loc); + TIMESCOPE(); + //TIMESCOPE_WITH_NAME_AND_IDENT("Hello", Loc); + //TIMESCOPE_WITH_RTM_AND_IDENT("Hello", Loc); DP("Entering target region for device %" PRId64 " with entry point " DPxMOD "\n", diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index cbce15b63a3eba2..dc6cd3944233955 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -433,7 +433,8 @@ class ExponentialBackoff { SourceInfo SI(IDENT); \ std::string ProfileLocation = SI.getProfileLocation(); \ std::string RTM = RegionTypeMsg; \ - llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM) + llvm::TimeTraceScope TimeScope(ProfileLocation, ProfileLocation + RTM) + //llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM) #else #define TIMESCOPE() #define TIMESCOPE_WITH_IDENT(IDENT) >From 92586bca6364100c7511ad38a30f41b0f86dea9c Mon Sep 17 00:00:00 2001 From: Felipe Cabarcas <cabar...@leia.crpl.cis.udel.edu> Date: Tue, 19 Sep 2023 12:02:53 +0000 Subject: [PATCH 2/9] Improve Profiler 1 --- llvm/lib/Support/TimeProfiler.cpp | 2 +- 
openmp/libomptarget/src/interface.cpp | 17 +++++++++-------- openmp/libomptarget/src/omptarget.cpp | 10 +++++----- openmp/libomptarget/src/private.h | 5 +++-- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp index 4d625b3eb5b1709..e1458116f64ab47 100644 --- a/llvm/lib/Support/TimeProfiler.cpp +++ b/llvm/lib/Support/TimeProfiler.cpp @@ -227,7 +227,7 @@ struct llvm::TimeTraceProfiler { J.attribute("ph", "X"); J.attribute("ts", 0); J.attribute("dur", DurUs); - J.attribute("name", "Total " + Total.first); + J.attribute("name", "Total: " + Total.first); J.attributeObject("args", [&] { J.attribute("count", int64_t(Count)); J.attribute("avg ms", int64_t(DurUs / Count / 1000)); diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index f64e1e268a3952e..b8892cbe689107f 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -33,14 +33,14 @@ using namespace llvm::omp::target::ompt; //////////////////////////////////////////////////////////////////////////////// /// adds requires flags EXTERN void __tgt_register_requires(int64_t Flags) { - TIMESCOPE(); + //TIMESCOPE(); PM->RTLs.registerRequires(Flags); } //////////////////////////////////////////////////////////////////////////////// /// adds a target shared library to the target execution image EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) { - TIMESCOPE(); + //TIMESCOPE(); if (PM->maybeDelayRegisterLib(Desc)) return; @@ -61,7 +61,7 @@ EXTERN void __tgt_init_all_rtls() { PM->RTLs.initAllRTLs(); } //////////////////////////////////////////////////////////////////////////////// /// unloads a target shared library EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) { - TIMESCOPE(); + //TIMESCOPE(); PM->RTLs.unregisterLib(Desc); for (auto &RTL : PM->RTLs.UsedRTLs) { if (RTL->unregister_lib) { @@ -82,7 +82,8 @@ targetData(ident_t *Loc, int64_t DeviceId, 
int32_t ArgNum, void **ArgsBase, static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>, "TargetAsyncInfoTy must be convertible to AsyncInfoTy."); - TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc); + //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc); + TIMESCOPE_WITH_RTM_AND_IDENT("targetData", Loc); DP("Entering data %s region for device %" PRId64 " with %d mappings\n", RegionName, DeviceId, ArgNum); @@ -253,9 +254,9 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, "Target AsyncInfoTy must be convertible to AsyncInfoTy."); //TIMESCOPE_WITH_IDENT(Loc); - TIMESCOPE(); + //TIMESCOPE(); //TIMESCOPE_WITH_NAME_AND_IDENT("Hello", Loc); - //TIMESCOPE_WITH_RTM_AND_IDENT("Hello", Loc); + //TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc); DP("Entering target region for device %" PRId64 " with entry point " DPxMOD "\n", @@ -411,7 +412,7 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, // Get the current number of components for a user-defined mapper. 
EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) { - TIMESCOPE(); + //TIMESCOPE(); auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle; int64_t Size = MapperComponentsPtr->Components.size(); DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n", @@ -423,7 +424,7 @@ EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) { EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base, void *Begin, int64_t Size, int64_t Type, void *Name) { - TIMESCOPE(); + //TIMESCOPE(); DP("__tgt_push_mapper_component(Handle=" DPxMOD ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 ", Type=0x%" PRIx64 ", Name=%s).\n", diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index 40419e448942608..3754f63909dac9c 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -398,7 +398,7 @@ static int32_t getParentIndex(int64_t Type) { void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, const char *Name) { - TIMESCOPE(); + //TIMESCOPE(); DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size); if (Size <= 0) { @@ -427,7 +427,7 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, const char *Name) { - TIMESCOPE(); + //TIMESCOPE(); DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum, DPxPTR(DevicePtr)); @@ -453,7 +453,7 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum, const char *Name) { - TIMESCOPE(); + //TIMESCOPE(); DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size); if (Size <= 0) { @@ -493,7 +493,7 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum, } void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) { - TIMESCOPE(); + //TIMESCOPE(); 
DP("Call to %s for device %d unlocking\n", Name, DeviceNum); DeviceTy *DevicePtr = nullptr; @@ -572,7 +572,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { - TIMESCOPE_WITH_IDENT(Loc); + //TIMESCOPE_WITH_IDENT(Loc); // process each input. for (int32_t I = 0; I < ArgNum; ++I) { // Ignore private variables and arrays - there is no mapping for them. diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index dc6cd3944233955..b1ada09d64c7a55 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -433,8 +433,9 @@ class ExponentialBackoff { SourceInfo SI(IDENT); \ std::string ProfileLocation = SI.getProfileLocation(); \ std::string RTM = RegionTypeMsg; \ - llvm::TimeTraceScope TimeScope(ProfileLocation, ProfileLocation + RTM) - //llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM) + llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM) + //llvm::TimeTraceScope TimeScope(ProfileLocation, ProfileLocation + RTM) + #else #define TIMESCOPE() #define TIMESCOPE_WITH_IDENT(IDENT) >From f9167dc8fef277ac1aa53e2e95bade3f0b727df1 Mon Sep 17 00:00:00 2001 From: Felipe Cabarcas <cabar...@leia.crpl.cis.udel.edu> Date: Tue, 19 Sep 2023 21:33:24 +0000 Subject: [PATCH 3/9] Changed profiling to work in nanoseconds. Made Profiling calls for runtime calls and different ones for kernel launches and memory transfers. 
--- llvm/lib/Support/TimeProfiler.cpp | 28 +++++++++++++-------------- openmp/libomptarget/src/interface.cpp | 7 ++----- openmp/libomptarget/src/omptarget.cpp | 11 +++++++---- openmp/libomptarget/src/private.h | 6 +++--- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp index e1458116f64ab47..64b3ef35be27c42 100644 --- a/llvm/lib/Support/TimeProfiler.cpp +++ b/llvm/lib/Support/TimeProfiler.cpp @@ -30,7 +30,7 @@ namespace { using std::chrono::duration; using std::chrono::duration_cast; -using std::chrono::microseconds; +using std::chrono::nanoseconds; using std::chrono::steady_clock; using std::chrono::system_clock; using std::chrono::time_point; @@ -80,14 +80,14 @@ struct TimeTraceProfilerEntry { // rather than casting duration. This avoids truncation issues causing inner // scopes overruning outer scopes. ClockType::rep getFlameGraphStartUs(TimePointType StartTime) const { - return (time_point_cast<microseconds>(Start) - - time_point_cast<microseconds>(StartTime)) + return (time_point_cast<nanoseconds>(Start) - + time_point_cast<nanoseconds>(StartTime)) .count(); } ClockType::rep getFlameGraphDurUs() const { - return (time_point_cast<microseconds>(End) - - time_point_cast<microseconds>(Start)) + return (time_point_cast<nanoseconds>(End) - + time_point_cast<nanoseconds>(Start)) .count(); } }; @@ -123,7 +123,7 @@ struct llvm::TimeTraceProfiler { DurationType Duration = E.End - E.Start; // Only include sections longer or equal to TimeTraceGranularity msec. 
- if (duration_cast<microseconds>(Duration).count() >= TimeTraceGranularity) + if (duration_cast<nanoseconds>(Duration).count() >= TimeTraceGranularity) Entries.emplace_back(E); // Track total time taken by each "name", but only the topmost levels of @@ -169,8 +169,8 @@ struct llvm::TimeTraceProfiler { J.attribute("pid", Pid); J.attribute("tid", int64_t(Tid)); J.attribute("ph", "X"); - J.attribute("ts", StartUs); - J.attribute("dur", DurUs); + J.attribute("ts", StartUs / 1000); + J.attribute("dur", DurUs / 1000); J.attribute("name", E.Name); if (!E.Detail.empty()) { J.attributeObject("args", [&] { J.attribute("detail", E.Detail); }); @@ -218,7 +218,7 @@ struct llvm::TimeTraceProfiler { // Report totals on separate threads of tracing file. uint64_t TotalTid = MaxTid + 1; for (const NameAndCountAndDurationType &Total : SortedTotals) { - auto DurUs = duration_cast<microseconds>(Total.second.second).count(); + auto DurUs = duration_cast<nanoseconds>(Total.second.second).count(); auto Count = AllCountAndTotalPerName[Total.first].first; J.object([&] { @@ -226,11 +226,11 @@ struct llvm::TimeTraceProfiler { J.attribute("tid", int64_t(TotalTid)); J.attribute("ph", "X"); J.attribute("ts", 0); - J.attribute("dur", DurUs); + J.attribute("dur", DurUs / 1000); J.attribute("name", "Total: " + Total.first); J.attributeObject("args", [&] { J.attribute("count", int64_t(Count)); - J.attribute("avg ms", int64_t(DurUs / Count / 1000)); + J.attribute("avg ms", int64_t(DurUs / Count / 1000 / 1000)); }); }); @@ -262,9 +262,9 @@ struct llvm::TimeTraceProfiler { // This can be used to combine the profiling data from // multiple processes and preserve actual time intervals. 
J.attribute("beginningOfTime", - time_point_cast<microseconds>(BeginningOfTime) + time_point_cast<nanoseconds>(BeginningOfTime) .time_since_epoch() - .count()); + .count()/1000); J.objectEnd(); } @@ -281,7 +281,7 @@ struct llvm::TimeTraceProfiler { SmallString<0> ThreadName; const uint64_t Tid; - // Minimum time granularity (in microseconds) + // Minimum time granularity (in nanoseconds) const unsigned TimeTraceGranularity; }; diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index b8892cbe689107f..d4ee246f84449f1 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -83,7 +83,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, "TargetAsyncInfoTy must be convertible to AsyncInfoTy."); //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc); - TIMESCOPE_WITH_RTM_AND_IDENT("targetData", Loc); + TIMESCOPE_WITH_RTM_AND_IDENT("Runtime Data Copy", Loc); DP("Entering data %s region for device %" PRId64 " with %d mappings\n", RegionName, DeviceId, ArgNum); @@ -253,10 +253,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>, "Target AsyncInfoTy must be convertible to AsyncInfoTy."); - //TIMESCOPE_WITH_IDENT(Loc); - //TIMESCOPE(); - //TIMESCOPE_WITH_NAME_AND_IDENT("Hello", Loc); - //TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc); + TIMESCOPE_WITH_NAME_AND_IDENT("Runtime target exe",Loc); DP("Entering target region for device %" PRId64 " with entry point " DPxMOD "\n", diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index 3754f63909dac9c..ad966e7e1c47544 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -863,6 +863,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void 
**ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { + //TIMESCOPE_WITH_NAME_AND_IDENT("targetDataEnd", Loc); int Ret = OFFLOAD_SUCCESS; auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>(); // process each input. @@ -955,7 +956,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, !TPR.Flags.IsHostPointer && DataSize != 0) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - + TIMESCOPE_WITH_NAME_AND_IDENT("DevToHost", Loc); // Wait for any previous transfer if an event is present. if (void *Event = TPR.getEntry()->getEvent()) { if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) { @@ -1445,7 +1446,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, SmallVector<ptrdiff_t> &TgtOffsets, PrivateArgumentManagerTy &PrivateArgumentManager, AsyncInfoTy &AsyncInfo) { - TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc); + //TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc); DeviceTy &Device = *PM->Devices[DeviceId]; int Ret = targetDataBegin(Loc, Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo); @@ -1493,6 +1494,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, DPxPTR(HstPtrVal)); continue; } + TIMESCOPE_WITH_RTM_AND_IDENT("HostToDev", Loc); DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin, @@ -1572,7 +1574,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr, map_var_info_t *ArgNames, void **ArgMappers, PrivateArgumentManagerTy &PrivateArgumentManager, AsyncInfoTy &AsyncInfo) { - TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc); + //TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc); DeviceTy &Device = *PM->Devices[DeviceId]; // Move data from device. 
@@ -1597,6 +1599,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr, return Ret; }); + return OFFLOAD_SUCCESS; } } // namespace @@ -1672,7 +1675,7 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, { assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!"); - TIMESCOPE_WITH_NAME_AND_IDENT("Initiate Kernel Launch", Loc); + TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc); #ifdef OMPT_SUPPORT assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 && diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index b1ada09d64c7a55..f0591cd17b0fd15 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -432,10 +432,10 @@ class ExponentialBackoff { #define TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, IDENT) \ SourceInfo SI(IDENT); \ std::string ProfileLocation = SI.getProfileLocation(); \ + std::string ProfileName = SI.getName(); \ std::string RTM = RegionTypeMsg; \ - llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM) - //llvm::TimeTraceScope TimeScope(ProfileLocation, ProfileLocation + RTM) - + llvm::TimeTraceScope TimeScope(ProfileName, ProfileLocation + RTM) + //llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM) #else #define TIMESCOPE() #define TIMESCOPE_WITH_IDENT(IDENT) >From c82ce52f244d218752fea2dcc1f347fc589cd016 Mon Sep 17 00:00:00 2001 From: Felipe Cabarcas <cabar...@leia.crpl.cis.udel.edu> Date: Thu, 21 Sep 2023 14:22:28 +0000 Subject: [PATCH 4/9] test with DevToHost --- openmp/libomptarget/src/omptarget.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index ad966e7e1c47544..e113942375ef9c6 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -956,7 +956,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, !TPR.Flags.IsHostPointer && DataSize != 
0) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - TIMESCOPE_WITH_NAME_AND_IDENT("DevToHost", Loc); + std::string MessageDataSize = "DevToHost "+std::to_string(DataSize)+"B"; + TIMESCOPE_WITH_NAME_AND_IDENT(MessageDataSize, Loc); // Wait for any previous transfer if an event is present. if (void *Event = TPR.getEntry()->getEvent()) { if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) { >From 448f0e77b6c824de73cbd9ae34d4c59b02e7e441 Mon Sep 17 00:00:00 2001 From: Felipe Cabarcas <cabar...@leia.crpl.cis.udel.edu> Date: Fri, 22 Sep 2023 21:48:57 +0000 Subject: [PATCH 5/9] Fixing nanoseconds in totals, adding syncronize timings, and adding extra info in kernels and device --- llvm/lib/Support/TimeProfiler.cpp | 24 ++++++++++++------------ openmp/libomptarget/src/interface.cpp | 18 ++++++++++-------- openmp/libomptarget/src/omptarget.cpp | 19 +++++++++---------- openmp/libomptarget/src/private.h | 10 +++++++--- 4 files changed, 38 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp index 64b3ef35be27c42..4446583102a8133 100644 --- a/llvm/lib/Support/TimeProfiler.cpp +++ b/llvm/lib/Support/TimeProfiler.cpp @@ -79,13 +79,13 @@ struct TimeTraceProfilerEntry { // Calculate timings for FlameGraph. Cast time points to microsecond precision // rather than casting duration. This avoids truncation issues causing inner // scopes overruning outer scopes. 
- ClockType::rep getFlameGraphStartUs(TimePointType StartTime) const { + ClockType::rep getFlameGraphStartNs(TimePointType StartTime) const { return (time_point_cast<nanoseconds>(Start) - time_point_cast<nanoseconds>(StartTime)) .count(); } - ClockType::rep getFlameGraphDurUs() const { + ClockType::rep getFlameGraphDurNs() const { return (time_point_cast<nanoseconds>(End) - time_point_cast<nanoseconds>(Start)) .count(); @@ -114,9 +114,9 @@ struct llvm::TimeTraceProfiler { // Check that end times monotonically increase. assert((Entries.empty() || - (E.getFlameGraphStartUs(StartTime) + E.getFlameGraphDurUs() >= - Entries.back().getFlameGraphStartUs(StartTime) + - Entries.back().getFlameGraphDurUs())) && + (E.getFlameGraphStartNs(StartTime) + E.getFlameGraphDurNs() >= + Entries.back().getFlameGraphStartNs(StartTime) + + Entries.back().getFlameGraphDurNs())) && "TimeProfiler scope ended earlier than previous scope"); // Calculate duration at full precision for overall counts. @@ -162,15 +162,15 @@ struct llvm::TimeTraceProfiler { // Emit all events for the main flame graph. auto writeEvent = [&](const auto &E, uint64_t Tid) { - auto StartUs = E.getFlameGraphStartUs(StartTime); - auto DurUs = E.getFlameGraphDurUs(); + auto StartNs = E.getFlameGraphStartNs(StartTime); + auto DurNs = E.getFlameGraphDurNs(); J.object([&] { J.attribute("pid", Pid); J.attribute("tid", int64_t(Tid)); J.attribute("ph", "X"); - J.attribute("ts", StartUs / 1000); - J.attribute("dur", DurUs / 1000); + J.attribute("ts", StartNs / 1000); + J.attribute("dur", DurNs / 1000); J.attribute("name", E.Name); if (!E.Detail.empty()) { J.attributeObject("args", [&] { J.attribute("detail", E.Detail); }); @@ -218,7 +218,7 @@ struct llvm::TimeTraceProfiler { // Report totals on separate threads of tracing file. 
uint64_t TotalTid = MaxTid + 1; for (const NameAndCountAndDurationType &Total : SortedTotals) { - auto DurUs = duration_cast<nanoseconds>(Total.second.second).count(); + auto DurNs = duration_cast<nanoseconds>(Total.second.second).count(); auto Count = AllCountAndTotalPerName[Total.first].first; J.object([&] { @@ -226,11 +226,11 @@ struct llvm::TimeTraceProfiler { J.attribute("tid", int64_t(TotalTid)); J.attribute("ph", "X"); J.attribute("ts", 0); - J.attribute("dur", DurUs / 1000); + J.attribute("dur", DurNs / 1000 ); J.attribute("name", "Total: " + Total.first); J.attributeObject("args", [&] { J.attribute("count", int64_t(Count)); - J.attribute("avg ms", int64_t(DurUs / Count / 1000 / 1000)); + J.attribute("avg us", int64_t(DurNs / Count / 1000)); }); }); diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index d4ee246f84449f1..bed9b1e40db455b 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -83,7 +83,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, "TargetAsyncInfoTy must be convertible to AsyncInfoTy."); //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc); - TIMESCOPE_WITH_RTM_AND_IDENT("Runtime Data Copy", Loc); + TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy","NumArgs="+std::to_string(ArgNum), Loc); DP("Entering data %s region for device %" PRId64 " with %d mappings\n", RegionName, DeviceId, ArgNum); @@ -252,9 +252,6 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, KernelArgsTy *KernelArgs) { static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>, "Target AsyncInfoTy must be convertible to AsyncInfoTy."); - - TIMESCOPE_WITH_NAME_AND_IDENT("Runtime target exe",Loc); - DP("Entering target region for device %" PRId64 " with entry point " DPxMOD "\n", DeviceId, DPxPTR(HostPtr)); @@ -279,7 +276,11 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, 
assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) && !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] && "OpenMP interface should not use multiple dimensions"); - + TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime target exe", + "NumTeams="+std::to_string(NumTeams)+ + ";NumArgs="+std::to_string(KernelArgs->NumArgs) + , Loc); + if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs, KernelArgs->ArgSizes, KernelArgs->ArgTypes, @@ -303,16 +304,17 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, OMPT_IF_BUILT(InterfaceRAII TargetRAII( RegionInterface.getCallbacks<ompt_target>(), DeviceId, /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));) - + int Rc = OFFLOAD_SUCCESS; Rc = target(Loc, Device, HostPtr, *KernelArgs, AsyncInfo); - + { + TIMESCOPE_WITH_RTM_AND_IDENT("syncronize", Loc); if (Rc == OFFLOAD_SUCCESS) Rc = AsyncInfo.synchronize(); handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!"); - + } return OMP_TGT_SUCCESS; } diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index e113942375ef9c6..5f6168b0bd2fca0 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -579,7 +579,7 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) continue; - + TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev","Size="+std::to_string(ArgSizes[I])+"B", Loc); if (ArgMappers && ArgMappers[I]) { // Instead of executing the regular path of targetDataBegin, call the // targetDataMapper variant which will call targetDataBegin again @@ -863,7 +863,6 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy 
&AsyncInfo, bool FromMapper) { - //TIMESCOPE_WITH_NAME_AND_IDENT("targetDataEnd", Loc); int Ret = OFFLOAD_SUCCESS; auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>(); // process each input. @@ -956,8 +955,7 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, !TPR.Flags.IsHostPointer && DataSize != 0) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - std::string MessageDataSize = "DevToHost "+std::to_string(DataSize)+"B"; - TIMESCOPE_WITH_NAME_AND_IDENT(MessageDataSize, Loc); + TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+std::to_string(DataSize)+"B", Loc); // Wait for any previous transfer if an event is present. if (void *Event = TPR.getEntry()->getEvent()) { if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) { @@ -1447,7 +1445,6 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, SmallVector<ptrdiff_t> &TgtOffsets, PrivateArgumentManagerTy &PrivateArgumentManager, AsyncInfoTy &AsyncInfo) { - //TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc); DeviceTy &Device = *PM->Devices[DeviceId]; int Ret = targetDataBegin(Loc, Device, ArgNum, ArgBases, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo); @@ -1494,8 +1491,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, "variable (" DPxMOD ")\n", DPxPTR(HstPtrVal)); continue; - } - TIMESCOPE_WITH_RTM_AND_IDENT("HostToDev", Loc); + } DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin, @@ -1575,7 +1571,6 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr, map_var_info_t *ArgNames, void **ArgMappers, PrivateArgumentManagerTy &PrivateArgumentManager, AsyncInfoTy &AsyncInfo) { - //TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc); DeviceTy &Device = *PM->Devices[DeviceId]; // 
Move data from device. @@ -1676,8 +1671,12 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, { assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!"); - TIMESCOPE_WITH_RTM_AND_IDENT("Kernel", Loc); - + TIMESCOPE_WITH_DETAILS_AND_IDENT("Kernel Target", + "NumArguments="+std::to_string(KernelArgs.NumArgs) + +";NumTeams="+std::to_string(KernelArgs.NumTeams[0]) + +";TripCount="+std::to_string(KernelArgs.Tripcount) + , Loc); + #ifdef OMPT_SUPPORT assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 && "Multi dimensional launch not supported yet."); diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index f0591cd17b0fd15..4bc1db79de3f2b7 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -432,14 +432,18 @@ class ExponentialBackoff { #define TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, IDENT) \ SourceInfo SI(IDENT); \ std::string ProfileLocation = SI.getProfileLocation(); \ - std::string ProfileName = SI.getName(); \ + std::string ProfileName = SI.getName(); \ std::string RTM = RegionTypeMsg; \ - llvm::TimeTraceScope TimeScope(ProfileName, ProfileLocation + RTM) + llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + RTM) //llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM) +#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) \ + SourceInfo SI(IDENT); \ + std::string ProfileLocation = SI.getProfileLocation(); \ + llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + Details) #else #define TIMESCOPE() #define TIMESCOPE_WITH_IDENT(IDENT) #define TIMESCOPE_WITH_NAME_AND_IDENT(NAME, IDENT) #define TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, IDENT) - +#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) #endif >From c8bb24e807324a6a42b50076e5a3d2159f1d6d74 Mon Sep 17 00:00:00 2001 From: Felipe Cabarcas <cabar...@leia.crpl.cis.udel.edu> Date: Tue, 26 Sep 2023 15:58:50 +0000 Subject: [PATCH 
6/9] Some fixes to the profiler --- openmp/libomptarget/src/api.cpp | 7 +++++++ openmp/libomptarget/src/interface.cpp | 16 +++++----------- openmp/libomptarget/src/omptarget.cpp | 5 ----- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index 942df8fdb94d660..f628a64c5b69fa4 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -50,6 +50,7 @@ EXTERN int omp_get_initial_device(void) { } EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) { + TIMESCOPE(); return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__); } @@ -66,6 +67,7 @@ EXTERN void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum) { } EXTERN void omp_target_free(void *Ptr, int DeviceNum) { + TIMESCOPE(); return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEFAULT, __func__); } @@ -134,6 +136,11 @@ EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length, size_t DstOffset, size_t SrcOffset, int DstDevice, int SrcDevice) { TIMESCOPE(); + /*TIMESCOPE_WITH_DETAILS_AND_IDENT("omp_target_memcpy", + "NumArguments="+std::to_string(KernelArgs.NumArgs) + +";NumTeams="+std::to_string(KernelArgs.NumTeams[0]) + +";TripCount="+std::to_string(KernelArgs.Tripcount) + , __FUNCTION__);*/ DP("Call to omp_target_memcpy, dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " "src offset %zu, length %zu\n", diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index bed9b1e40db455b..61a340ccf8d1b10 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -33,14 +33,12 @@ using namespace llvm::omp::target::ompt; //////////////////////////////////////////////////////////////////////////////// /// adds requires flags EXTERN void __tgt_register_requires(int64_t Flags) { - //TIMESCOPE(); PM->RTLs.registerRequires(Flags); } 
//////////////////////////////////////////////////////////////////////////////// /// adds a target shared library to the target execution image EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) { - //TIMESCOPE(); if (PM->maybeDelayRegisterLib(Desc)) return; @@ -61,7 +59,6 @@ EXTERN void __tgt_init_all_rtls() { PM->RTLs.initAllRTLs(); } //////////////////////////////////////////////////////////////////////////////// /// unloads a target shared library EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) { - //TIMESCOPE(); PM->RTLs.unregisterLib(Desc); for (auto &RTL : PM->RTLs.UsedRTLs) { if (RTL->unregister_lib) { @@ -82,7 +79,6 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>, "TargetAsyncInfoTy must be convertible to AsyncInfoTy."); - //TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc); TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy","NumArgs="+std::to_string(ArgNum), Loc); DP("Entering data %s region for device %" PRId64 " with %d mappings\n", @@ -307,13 +303,13 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, int Rc = OFFLOAD_SUCCESS; Rc = target(Loc, Device, HostPtr, *KernelArgs, AsyncInfo); - { + { //required to show syncronization TIMESCOPE_WITH_RTM_AND_IDENT("syncronize", Loc); - if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + if (Rc == OFFLOAD_SUCCESS) + Rc = AsyncInfo.synchronize(); - handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); - assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!"); + handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); + assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!"); } return OMP_TGT_SUCCESS; } @@ -411,7 +407,6 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, // Get the current number of components for a user-defined mapper. 
EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) { - //TIMESCOPE(); auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle; int64_t Size = MapperComponentsPtr->Components.size(); DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n", @@ -423,7 +418,6 @@ EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) { EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base, void *Begin, int64_t Size, int64_t Type, void *Name) { - //TIMESCOPE(); DP("__tgt_push_mapper_component(Handle=" DPxMOD ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 ", Type=0x%" PRIx64 ", Name=%s).\n", diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index 5f6168b0bd2fca0..450f34894fb56b4 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -398,7 +398,6 @@ static int32_t getParentIndex(int64_t Type) { void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, const char *Name) { - //TIMESCOPE(); DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size); if (Size <= 0) { @@ -427,7 +426,6 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, const char *Name) { - //TIMESCOPE(); DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum, DPxPTR(DevicePtr)); @@ -453,7 +451,6 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum, const char *Name) { - //TIMESCOPE(); DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size); if (Size <= 0) { @@ -493,7 +490,6 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum, } void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) { - //TIMESCOPE(); DP("Call to %s for device %d unlocking\n", Name, DeviceNum); DeviceTy *DevicePtr = nullptr; 
@@ -572,7 +568,6 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { - //TIMESCOPE_WITH_IDENT(Loc); // process each input. for (int32_t I = 0; I < ArgNum; ++I) { // Ignore private variables and arrays - there is no mapping for them. >From da71cf17918c56e6a64c1e966dbb5d0dd79d0ed9 Mon Sep 17 00:00:00 2001 From: Felipe Cabarcas <cabar...@leia.crpl.cis.udel.edu> Date: Tue, 26 Sep 2023 21:06:06 +0000 Subject: [PATCH 7/9] Adding information to some omp api calls --- openmp/libomptarget/src/api.cpp | 22 ++++++++++++---------- openmp/libomptarget/src/private.h | 7 +++++-- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index f628a64c5b69fa4..5dd918808492997 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -50,7 +50,8 @@ EXTERN int omp_get_initial_device(void) { } EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) { - TIMESCOPE(); + TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DeviceNum) + +";size="+std::to_string(Size)); return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__); } @@ -135,12 +136,9 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) { EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length, size_t DstOffset, size_t SrcOffset, int DstDevice, int SrcDevice) { - TIMESCOPE(); - /*TIMESCOPE_WITH_DETAILS_AND_IDENT("omp_target_memcpy", - "NumArguments="+std::to_string(KernelArgs.NumArgs) - +";NumTeams="+std::to_string(KernelArgs.NumTeams[0]) - +";TripCount="+std::to_string(KernelArgs.Tripcount) - , __FUNCTION__);*/ + TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice) + +";src_dev="+std::to_string(SrcDevice) + +";size="+std::to_string(Length)); DP("Call to omp_target_memcpy, dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst 
offset %zu, " "src offset %zu, length %zu\n", @@ -293,7 +291,9 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length, size_t DstOffset, size_t SrcOffset, int DstDevice, int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) { - TIMESCOPE(); + TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice) + +";src_dev="+std::to_string(SrcDevice) + +";size="+std::to_string(Length)); DP("Call to omp_target_memcpy_async, dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " "src offset %zu, length %zu\n", @@ -321,7 +321,6 @@ omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize, const size_t *DstOffsets, const size_t *SrcOffsets, const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice, int SrcDevice) { - TIMESCOPE(); DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", " "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " @@ -380,7 +379,10 @@ EXTERN int omp_target_memcpy_rect_async( const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets, const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice, int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) { - TIMESCOPE(); + TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice) + +";src_dev="+std::to_string(SrcDevice) + +";size="+std::to_string(ElementSize) + +";num_dims="+std::to_string(NumDims)); DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", " "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index 4bc1db79de3f2b7..c8d07138b180d17 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -434,16 +434,19 @@ class ExponentialBackoff { std::string ProfileLocation = 
SI.getProfileLocation(); \ std::string ProfileName = SI.getName(); \ std::string RTM = RegionTypeMsg; \ - llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + RTM) - //llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM) + llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM) + //llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + RTM) #define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) \ SourceInfo SI(IDENT); \ std::string ProfileLocation = SI.getProfileLocation(); \ llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + Details) +#define TIMESCOPE_WITH_DETAILS(Details) \ + llvm::TimeTraceScope TimeScope(__FUNCTION__, Details) #else #define TIMESCOPE() #define TIMESCOPE_WITH_IDENT(IDENT) #define TIMESCOPE_WITH_NAME_AND_IDENT(NAME, IDENT) #define TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, IDENT) #define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) +#define TIMESCOPE_WITH_DETAILS(Details) #endif >From f273bbcc66f361fe9cc03d8597ee886122b5e235 Mon Sep 17 00:00:00 2001 From: fel-cab <fel-...@github.com> Date: Mon, 2 Oct 2023 12:26:51 +0000 Subject: [PATCH 8/9] Adding information to the LIBOMPTARGET profiler runtime kernel and API calls. 
--- openmp/libomptarget/src/interface.cpp | 14 ++++++++------ openmp/libomptarget/src/omptarget.cpp | 24 +++++++++++++++--------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index 61a340ccf8d1b10..99a7abc7e0bcee9 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -79,7 +79,9 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>, "TargetAsyncInfoTy must be convertible to AsyncInfoTy."); - TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy","NumArgs="+std::to_string(ArgNum), Loc); + TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy", + "NumArgs="+ + std::to_string(ArgNum), Loc); DP("Entering data %s region for device %" PRId64 " with %d mappings\n", RegionName, DeviceId, ArgNum); @@ -273,10 +275,10 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] && "OpenMP interface should not use multiple dimensions"); TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime target exe", - "NumTeams="+std::to_string(NumTeams)+ - ";NumArgs="+std::to_string(KernelArgs->NumArgs) - , Loc); - + "NumTeams="+std::to_string(NumTeams)+ + ";NumArgs="+ + std::to_string(KernelArgs->NumArgs), Loc); + if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs, KernelArgs->ArgSizes, KernelArgs->ArgTypes, @@ -300,7 +302,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, OMPT_IF_BUILT(InterfaceRAII TargetRAII( RegionInterface.getCallbacks<ompt_target>(), DeviceId, /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));) - + int Rc = OFFLOAD_SUCCESS; Rc = target(Loc, Device, HostPtr, *KernelArgs, AsyncInfo); { //required to show syncronization diff --git a/openmp/libomptarget/src/omptarget.cpp 
b/openmp/libomptarget/src/omptarget.cpp index 450f34894fb56b4..b5a2dfc68569081 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -574,7 +574,10 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) continue; - TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev","Size="+std::to_string(ArgSizes[I])+"B", Loc); + TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev", + "Size="+ + std::to_string(ArgSizes[I])+ + "B", Loc); if (ArgMappers && ArgMappers[I]) { // Instead of executing the regular path of targetDataBegin, call the // targetDataMapper variant which will call targetDataBegin again @@ -950,7 +953,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, !TPR.Flags.IsHostPointer && DataSize != 0) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+std::to_string(DataSize)+"B", Loc); + TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+ + std::to_string(DataSize)+"B", Loc); // Wait for any previous transfer if an event is present. 
if (void *Event = TPR.getEntry()->getEvent()) { if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) { @@ -1486,7 +1490,7 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, "variable (" DPxMOD ")\n", DPxPTR(HstPtrVal)); continue; - } + } DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin, @@ -1590,7 +1594,6 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr, return Ret; }); - return OFFLOAD_SUCCESS; } } // namespace @@ -1667,11 +1670,14 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, { assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!"); TIMESCOPE_WITH_DETAILS_AND_IDENT("Kernel Target", - "NumArguments="+std::to_string(KernelArgs.NumArgs) - +";NumTeams="+std::to_string(KernelArgs.NumTeams[0]) - +";TripCount="+std::to_string(KernelArgs.Tripcount) - , Loc); - + "NumArguments="+ + std::to_string(KernelArgs.NumArgs)+ + ";NumTeams="+ + std::to_string(KernelArgs.NumTeams[0])+ + ";TripCount="+ + std::to_string(KernelArgs.Tripcount) + , Loc); + #ifdef OMPT_SUPPORT assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 && "Multi dimensional launch not supported yet."); >From 08dbdd5ba1c0502b6d1c935bac6cc14acd4f04be Mon Sep 17 00:00:00 2001 From: fel-cab <fel-...@github.com> Date: Mon, 2 Oct 2023 19:14:01 +0000 Subject: [PATCH 9/9] Fixing format --- llvm/lib/Support/TimeProfiler.cpp | 10 +++---- openmp/libomptarget/src/api.cpp | 24 ++++++++--------- openmp/libomptarget/src/interface.cpp | 14 +++++----- openmp/libomptarget/src/omptarget.cpp | 39 ++++++++++++--------------- openmp/libomptarget/src/private.h | 2 +- 5 files changed, 42 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp index 4446583102a8133..330a4d93378affe 100644 --- a/llvm/lib/Support/TimeProfiler.cpp +++ 
b/llvm/lib/Support/TimeProfiler.cpp @@ -226,7 +226,7 @@ struct llvm::TimeTraceProfiler { J.attribute("tid", int64_t(TotalTid)); J.attribute("ph", "X"); J.attribute("ts", 0); - J.attribute("dur", DurNs / 1000 ); + J.attribute("dur", DurNs / 1000); J.attribute("name", "Total: " + Total.first); J.attributeObject("args", [&] { J.attribute("count", int64_t(Count)); @@ -261,10 +261,10 @@ struct llvm::TimeTraceProfiler { // Emit the absolute time when this TimeProfiler started. // This can be used to combine the profiling data from // multiple processes and preserve actual time intervals. - J.attribute("beginningOfTime", - time_point_cast<nanoseconds>(BeginningOfTime) - .time_since_epoch() - .count()/1000); + J.attribute("beginningOfTime", time_point_cast<nanoseconds>(BeginningOfTime) + .time_since_epoch() + .count() / + 1000); J.objectEnd(); } diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index 5dd918808492997..06de1f8f20b7ae2 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -50,8 +50,8 @@ EXTERN int omp_get_initial_device(void) { } EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) { - TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DeviceNum) - +";size="+std::to_string(Size)); + TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) + + ";size=" + std::to_string(Size)); return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__); } @@ -136,9 +136,9 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) { EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length, size_t DstOffset, size_t SrcOffset, int DstDevice, int SrcDevice) { - TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice) - +";src_dev="+std::to_string(SrcDevice) - +";size="+std::to_string(Length)); + TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) + + ";src_dev=" + std::to_string(SrcDevice) + + ";size=" + std::to_string(Length)); DP("Call to omp_target_memcpy, 
dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " "src offset %zu, length %zu\n", @@ -291,9 +291,9 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length, size_t DstOffset, size_t SrcOffset, int DstDevice, int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) { - TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice) - +";src_dev="+std::to_string(SrcDevice) - +";size="+std::to_string(Length)); + TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) + + ";src_dev=" + std::to_string(SrcDevice) + + ";size=" + std::to_string(Length)); DP("Call to omp_target_memcpy_async, dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " "src offset %zu, length %zu\n", @@ -379,10 +379,10 @@ EXTERN int omp_target_memcpy_rect_async( const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets, const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice, int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) { - TIMESCOPE_WITH_DETAILS("dst_dev="+std::to_string(DstDevice) - +";src_dev="+std::to_string(SrcDevice) - +";size="+std::to_string(ElementSize) - +";num_dims="+std::to_string(NumDims)); + TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) + + ";src_dev=" + std::to_string(SrcDevice) + + ";size=" + std::to_string(ElementSize) + + ";num_dims=" + std::to_string(NumDims)); DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", " "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index 99a7abc7e0bcee9..2c7ab7a49d0bfb0 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -80,8 +80,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, "TargetAsyncInfoTy must be 
convertible to AsyncInfoTy."); TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy", - "NumArgs="+ - std::to_string(ArgNum), Loc); + "NumArgs=" + std::to_string(ArgNum), Loc); DP("Entering data %s region for device %" PRId64 " with %d mappings\n", RegionName, DeviceId, ArgNum); @@ -274,10 +273,11 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) && !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] && "OpenMP interface should not use multiple dimensions"); - TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime target exe", - "NumTeams="+std::to_string(NumTeams)+ - ";NumArgs="+ - std::to_string(KernelArgs->NumArgs), Loc); + TIMESCOPE_WITH_DETAILS_AND_IDENT( + "Runtime target exe", + "NumTeams=" + std::to_string(NumTeams) + + ";NumArgs=" + std::to_string(KernelArgs->NumArgs), + Loc); if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs, @@ -305,7 +305,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, int Rc = OFFLOAD_SUCCESS; Rc = target(Loc, Device, HostPtr, *KernelArgs, AsyncInfo); - { //required to show syncronization + { // required to show syncronization TIMESCOPE_WITH_RTM_AND_IDENT("syncronize", Loc); if (Rc == OFFLOAD_SUCCESS) Rc = AsyncInfo.synchronize(); diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index b5a2dfc68569081..277f95d7efa8201 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -574,10 +574,8 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) continue; - TIMESCOPE_WITH_DETAILS_AND_IDENT("HostToDev", - "Size="+ - std::to_string(ArgSizes[I])+ - "B", Loc); + TIMESCOPE_WITH_DETAILS_AND_IDENT( + "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc); if 
(ArgMappers && ArgMappers[I]) { // Instead of executing the regular path of targetDataBegin, call the // targetDataMapper variant which will call targetDataBegin again @@ -825,14 +823,13 @@ postProcessingTargetDataEnd(DeviceTy *Device, // remaining shadow pointer entries for this struct. const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM; if (HasFrom) { - Entry->foreachShadowPointerInfo( - [&](const ShadowPtrInfoTy &ShadowPtr) { - *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal; - DP("Restoring original host pointer value " DPxMOD " for host " - "pointer " DPxMOD "\n", - DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr)); - return OFFLOAD_SUCCESS; - }); + Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) { + *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal; + DP("Restoring original host pointer value " DPxMOD " for host " + "pointer " DPxMOD "\n", + DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr)); + return OFFLOAD_SUCCESS; + }); } // Give up the lock as we either don't need it anymore (e.g., done with @@ -953,8 +950,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, !TPR.Flags.IsHostPointer && DataSize != 0) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - TIMESCOPE_WITH_DETAILS_AND_IDENT("DevToHost","Size="+ - std::to_string(DataSize)+"B", Loc); + TIMESCOPE_WITH_DETAILS_AND_IDENT( + "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc); // Wait for any previous transfer if an event is present. 
if (void *Event = TPR.getEntry()->getEvent()) { if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) { @@ -1669,14 +1666,12 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, { assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!"); - TIMESCOPE_WITH_DETAILS_AND_IDENT("Kernel Target", - "NumArguments="+ - std::to_string(KernelArgs.NumArgs)+ - ";NumTeams="+ - std::to_string(KernelArgs.NumTeams[0])+ - ";TripCount="+ - std::to_string(KernelArgs.Tripcount) - , Loc); + TIMESCOPE_WITH_DETAILS_AND_IDENT( + "Kernel Target", + "NumArguments=" + std::to_string(KernelArgs.NumArgs) + + ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) + + ";TripCount=" + std::to_string(KernelArgs.Tripcount), + Loc); #ifdef OMPT_SUPPORT assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 && diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index c8d07138b180d17..8657390dde17dc1 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -435,7 +435,7 @@ class ExponentialBackoff { std::string ProfileName = SI.getName(); \ std::string RTM = RegionTypeMsg; \ llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM) - //llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + RTM) +// llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + RTM) #define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) \ SourceInfo SI(IDENT); \ std::string ProfileLocation = SI.getProfileLocation(); \ _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits