https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/127736
>From fd8f342fa2b65f7604955c88e2b73e758dc17134 Mon Sep 17 00:00:00 2001 From: Alex Maclean <amacl...@nvidia.com> Date: Wed, 19 Feb 2025 02:26:23 +0000 Subject: [PATCH 1/2] [NVPTX] Convert vector function nvvm.annotations to attributes --- clang/lib/CodeGen/Targets/NVPTX.cpp | 8 +- clang/test/CodeGenCUDA/launch-bounds.cu | 59 +++++------- clang/test/OpenMP/ompx_attributes_codegen.cpp | 19 ++-- clang/test/OpenMP/thread_limit_nvptx.c | 18 ++-- llvm/docs/NVPTXUsage.rst | 17 ++++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 51 ++--------- llvm/lib/IR/AutoUpgrade.cpp | 45 ++++++++++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 42 ++++----- .../Target/NVPTX/NVPTXCtorDtorLowering.cpp | 29 +----- .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 17 ++-- llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 90 +++++++++---------- llvm/lib/Target/NVPTX/NVPTXUtilities.h | 18 ++-- llvm/lib/Target/NVPTX/NVVMIntrRange.cpp | 4 +- .../KernelInfo/launch-bounds/nvptx.ll | 6 +- llvm/test/CodeGen/NVPTX/annotations.ll | 12 +-- llvm/test/CodeGen/NVPTX/bug26185-2.ll | 6 +- llvm/test/CodeGen/NVPTX/cluster-dim.ll | 7 +- llvm/test/CodeGen/NVPTX/intr-range.ll | 15 ++-- llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll | 9 +- llvm/test/CodeGen/NVPTX/maxclusterrank.ll | 8 +- .../CodeGen/NVPTX/upgrade-nvvm-annotations.ll | 62 ++++++++++++- .../Dialect/NVVM/NVVMToLLVMIRTranslation.cpp | 41 +++------ mlir/test/Target/LLVMIR/nvvmir.mlir | 27 ++---- 23 files changed, 296 insertions(+), 314 deletions(-) diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 98a90613a2d3e..f617e645a9eaf 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -357,17 +357,13 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F, int32_t *MaxThreadsVal, int32_t *MinBlocksVal, int32_t *MaxClusterRankVal) { - // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node llvm::APSInt MaxThreads(32); MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(getContext()); if (MaxThreads > 0) { if (MaxThreadsVal) *MaxThreadsVal = MaxThreads.getExtValue(); - if (F) { - // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node - NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx", - MaxThreads.getExtValue()); - } + if (F) + F->addFnAttr("nvvm.maxntid", llvm::utostr(MaxThreads.getExtValue())); } // min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it diff --git a/clang/test/CodeGenCUDA/launch-bounds.cu b/clang/test/CodeGenCUDA/launch-bounds.cu index 72f7857264f8c..fba66e85040c7 100644 --- a/clang/test/CodeGenCUDA/launch-bounds.cu +++ b/clang/test/CodeGenCUDA/launch-bounds.cu @@ -10,23 +10,30 @@ #endif // CHECK: @Kernel1() #[[ATTR0:[0-9]+]] +// CHECK: @Kernel2() #[[ATTR1:[0-9]+]] +// CHECK: @{{.*}}Kernel3{{.*}}() #[[ATTR1]] // CHECK: @{{.*}}Kernel4{{.*}}() #[[ATTR0]] -// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR1:[0-9]+]] -// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR0]] -// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR3:[0-9]+]] - -// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}} -// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.minctasm"="258" {{.*}}} -// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="12" {{.*}}} - -// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR4:[0-9]+]] -// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR4]] -// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR5:[0-9]+]] -// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR6:[0-9]+]] - -// CHECK_MAX_BLOCKS: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.minctasm"="2" {{.*}}} -// CHECK_MAX_BLOCKS: attributes #[[ATTR5]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.minctasm"="258" {{.*}}} -// CHECK_MAX_BLOCKS: attributes #[[ATTR6]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.minctasm"="12" {{.*}}} +// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR2:[0-9]+]] +// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR3:[0-9]+]] +// CHECK: @{{.*}}Kernel7{{.*}}() #[[ATTR1]] +// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR4:[0-9]+]] + +// CHECK-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}} +// CHECK-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}} +// CHECK-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}} +// CHECK-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}} +// CHECK-DAG: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}} + +// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR0:[0-9]+]] +// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR0]] +// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR1:[0-9]+]] +// CHECK_MAX_BLOCKS: @{{.*}}Kernel7_sm_90{{.*}} #[[ATTR2:[0-9]+]] +// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR3:[0-9]+]] + +// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}} +// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}} +// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}} +// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}} // Test both max threads per block and Min cta per sm. extern "C" { @@ -37,8 +44,6 @@ Kernel1() } } -// CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256} - #ifdef USE_MAX_BLOCKS // Test max threads per block and min/max cta per sm. extern "C" { @@ -48,8 +53,6 @@ Kernel1_sm_90() { } } - -// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256} #endif // USE_MAX_BLOCKS // Test only max threads per block. Min cta per sm defaults to 0, and @@ -62,8 +65,6 @@ Kernel2() } } -// CHECK: !{{[0-9]+}} = !{ptr @Kernel2, !"maxntidx", i32 256} - template <int max_threads_per_block> __global__ void __launch_bounds__(max_threads_per_block) @@ -72,7 +73,6 @@ Kernel3() } template __global__ void Kernel3<MAX_THREADS_PER_BLOCK>(); -// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel3{{.*}}, !"maxntidx", i32 256} template <int max_threads_per_block, int min_blocks_per_mp> __global__ void @@ -82,7 +82,6 @@ Kernel4() } template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>(); -// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256} #ifdef USE_MAX_BLOCKS template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp> @@ -93,7 +92,6 @@ Kernel4_sm_90() } template __global__ void Kernel4_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>(); -// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256} #endif //USE_MAX_BLOCKS const int constint = 100; @@ -106,8 +104,6 @@ Kernel5() } template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>(); -// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356} - #ifdef USE_MAX_BLOCKS template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp> @@ -120,7 +116,6 @@ Kernel5_sm_90() } template __global__ void Kernel5_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>(); -// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356} #endif //USE_MAX_BLOCKS // Make sure we don't emit negative launch bounds values. @@ -129,15 +124,12 @@ __launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP ) Kernel6() { } -// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"maxntidx", __global__ void __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP ) Kernel7() { } -// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"maxntidx", -// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"minctasm", #ifdef USE_MAX_BLOCKS __global__ void @@ -145,17 +137,12 @@ __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP, -MAX_BLOCKS_PER_MP Kernel7_sm_90() { } -// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxntidx", -// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"minctasm", -// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxclusterrank", #endif // USE_MAX_BLOCKS const char constchar = 12; __global__ void __launch_bounds__(constint, constchar) Kernel8() {} -// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100 #ifdef USE_MAX_BLOCKS const char constchar_2 = 14; __global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {} -// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100 #endif // USE_MAX_BLOCKS diff --git a/clang/test/OpenMP/ompx_attributes_codegen.cpp b/clang/test/OpenMP/ompx_attributes_codegen.cpp index 6c163c1875171..7cdbe8b9d788a 100644 --- a/clang/test/OpenMP/ompx_attributes_codegen.cpp +++ b/clang/test/OpenMP/ompx_attributes_codegen.cpp @@ -15,6 +15,10 @@ void func() { // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}}) // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #4 + // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #[[ATTR0:[0-9]+]] + // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}}) #[[ATTR1:[0-9]+]] + // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #[[ATTR2:[0-9]+]] + #pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]]) {} #pragma omp target teams ompx_attribute(__attribute__((launch_bounds(45, 90)))) @@ -34,9 +38,12 @@ void func() { // AMD-SAME: "omp_target_thread_limit"="17" // It is unclear if we should use the AMD annotations for other targets, we do for now. -// NVIDIA: "omp_target_thread_limit"="20" -// NVIDIA: "omp_target_thread_limit"="45" -// NVIDIA: "omp_target_thread_limit"="17" -// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l18, !"maxntidx", i32 20} -// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l20, !"maxntidx", i32 45} -// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l22, !"maxntidx", i32 17} +// NVIDIA: attributes #[[ATTR0]] +// NVIDIA-SAME: "omp_target_thread_limit"="20" +// NVIDIA-SAME: "nvvm.maxntid"="20" +// NVIDIA: attributes #[[ATTR1]] +// NVIDIA-SAME: "omp_target_thread_limit"="45" +// NVIDIA-SAME: "nvvm.maxntid"="45" +// NVIDIA: attributes #[[ATTR2]] +// NVIDIA-SAME: "omp_target_thread_limit"="17" +// NVIDIA-SAME: "nvvm.maxntid"="17" diff --git a/clang/test/OpenMP/thread_limit_nvptx.c b/clang/test/OpenMP/thread_limit_nvptx.c index 2132e1aa7834a..ffa6c453067d1 100644 --- a/clang/test/OpenMP/thread_limit_nvptx.c +++ b/clang/test/OpenMP/thread_limit_nvptx.c @@ -7,23 +7,21 @@ #define HEADER void foo(int N) { -// CHECK: l11, !"maxntidx", i32 128} +// CHECK: define {{.*}}l11{{.*}} #[[ATTR0:[0-9]+]] #pragma omp target teams distribute parallel for simd for (int i = 0; i < N; ++i) ; -// CHECK: l15, !"maxntidx", i32 4} +// CHECK: define {{.*}}l15{{.*}} #[[ATTR1:[0-9]+]] #pragma omp target teams distribute parallel for simd thread_limit(4) for (int i = 0; i < N; ++i) ; -// CHECK-NOT: l21, !"maxntidx", i32 128} -// CHECK: l21, !"maxntidx", i32 42} -// CHECK-NOT: l21, !"maxntidx", i32 128} + +// CHECK: define {{.*}}l20{{.*}} #[[ATTR2:[0-9]+]] #pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) for (int i = 0; i < N; ++i) ; -// CHECK-NOT: l27, !"maxntidx", i32 42} -// CHECK: l27, !"maxntidx", i32 22} -// CHECK-NOT: l27, !"maxntidx", i32 42} + +// CHECK: define {{.*}}l25{{.*}} #[[ATTR3:[0-9]+]] #pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22) for (int i = 0; i < N; ++i) ; @@ -31,3 +29,7 @@ void foo(int N) { #endif +// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="128" {{.*}}} +// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="4" {{.*}}} +// CHECK: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="42" {{.*}}} +// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxntid"="22" {{.*}}} diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 8550af456e961..f17d7ddd75f19 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -74,6 +74,23 @@ Function Attributes This attribute indicates the maximum number of registers to be used for the kernel function. +``"nvvm.maxntid"="<x>[,<y>[,<z>]]"`` + This attribute declares the maximum number of threads in the thread block + (CTA). The maximum number of threads is the product of the maximum extent in + each dimension. Exceeding the maximum number of threads results in a runtime + error or kernel launch failure. + +``"nvvm.reqntid"="<x>[,<y>[,<z>]]"`` + This attribute declares the exact number of threads in the thread block + (CTA). The number of threads is the product of the value in each dimension. + Specifying a different CTA dimension at launch will result in a runtime + error or kernel launch failure. + +``"nvvm.cluster_dim"="<x>[,<y>[,<z>]]"`` + This attribute declares the number of thread blocks (CTAs) in the cluster. + The total number of CTAs is the product of the number of CTAs in each + dimension. Specifying a different cluster dimension at launch will result in + a runtime error or kernel launch failure. Only supported for Hopper+. .. _address_spaces: diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 18bc82fc827f7..a56e66fbab0a1 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6366,45 +6366,13 @@ void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc, KernelEnvironmentGV->setInitializer(NewInitializer); } -static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) { - Module &M = *Kernel.getParent(); - NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); - for (auto *Op : MD->operands()) { - if (Op->getNumOperands() != 3) - continue; - auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0)); - if (!KernelOp || KernelOp->getValue() != &Kernel) - continue; - auto *Prop = dyn_cast<MDString>(Op->getOperand(1)); - if (!Prop || Prop->getString() != Name) - continue; - return Op; - } - return nullptr; -} - -static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, - bool Min) { - // Update the "maxntidx" metadata for NVIDIA, or add it. - MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name); - if (ExistingOp) { - auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2)); - int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue(); - ExistingOp->replaceOperandWith( - 2, ConstantAsMetadata::get(ConstantInt::get( - OldVal->getValue()->getType(), - Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value)))); - } else { - LLVMContext &Ctx = Kernel.getContext(); - Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel), - MDString::get(Ctx, Name), - ConstantAsMetadata::get( - ConstantInt::get(Type::getInt32Ty(Ctx), Value))}; - // Append metadata to nvvm.annotations - Module &M = *Kernel.getParent(); - NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); - MD->addOperand(MDNode::get(Ctx, MDVals)); +static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, + bool Min) { + if (Kernel.hasFnAttribute(Name)) { + int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name); + Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value); } + Kernel.addFnAttr(Name, llvm::utostr(Value)); } std::pair<int32_t, int32_t> @@ -6426,9 +6394,8 @@ OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) { return {LB, UB}; } - if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) { - auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2)); - int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue(); + if (Kernel.hasFnAttribute("nvvm.maxntid")) { + int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid"); return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB}; } return {0, ThreadLimit}; @@ -6445,7 +6412,7 @@ void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T, return; } - updateNVPTXMetadata(Kernel, "maxntidx", UB, true); + updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true); } std::pair<int32_t, int32_t> diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 57072715366c9..dc18ba9780fec 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -13,11 +13,13 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/AutoUpgrade.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/AttributeMask.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" @@ -46,6 +48,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Regex.h" #include "llvm/TargetParser/Triple.h" +#include <cstdint> #include <cstring> #include <numeric> @@ -5021,6 +5024,36 @@ bool llvm::UpgradeDebugInfo(Module &M) { return Modified; } +static void upgradeNVVMFnVectorAttr(const StringRef Attr, const char DimC, + GlobalValue *GV, const Metadata *V) { + Function *F = cast<Function>(GV); + + constexpr StringLiteral DefaultValue = "1"; + StringRef Vect3[3] = {DefaultValue, DefaultValue, DefaultValue}; + unsigned Length = 0; + + if (F->hasFnAttribute(Attr)) { + StringRef S = F->getFnAttribute(Attr).getValueAsString(); + for (; Length < 3 && !S.empty(); Length++) { + auto [Part, Rest] = S.split(','); + Vect3[Length] = Part.trim(); + S = Rest; + } + } + + const uint64_t VInt = mdconst::extract<ConstantInt>(V)->getZExtValue(); + const std::string VStr = llvm::utostr(VInt); + + const unsigned Dim = DimC - 'x'; + assert(Dim >= 0 && Dim < 3 && "Unexpected dim char"); + + Vect3[Dim] = VStr; + Length = std::max(Length, Dim + 1); + + const std::string NewAttr = llvm::join(ArrayRef(Vect3, Length), ","); + F->addFnAttr(Attr, NewAttr); +} + bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K, const Metadata *V) { if (K == "kernel") { @@ -5059,6 +5092,18 @@ bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K, cast<Function>(GV)->addFnAttr("nvvm.maxnreg", llvm::utostr(CV)); return true; } + if (K.consume_front("maxntid") && (K == "x" || K == "y" || K == "z")) { + upgradeNVVMFnVectorAttr("nvvm.maxntid", K[0], GV, V); + return true; + } + if (K.consume_front("reqntid") && (K == "x" || K == "y" || K == "z")) { + upgradeNVVMFnVectorAttr("nvvm.reqntid", K[0], GV, V); + return true; + } + if (K.consume_front("cluster_dim_") && (K == "x" || K == "y" || K == "z")) { + upgradeNVVMFnVectorAttr("nvvm.cluster_dim", K[0], GV, V); + return true; + } return false; } diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index c8e29c1da6ec4..f98d9609ff135 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -35,6 +35,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -506,24 +507,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // If the NVVM IR has some of reqntid* specified, then output // the reqntid directive, and set the unspecified ones to 1. // If none of Reqntid* is specified, don't output reqntid directive. - std::optional<unsigned> Reqntidx = getReqNTIDx(F); - std::optional<unsigned> Reqntidy = getReqNTIDy(F); - std::optional<unsigned> Reqntidz = getReqNTIDz(F); + const auto ReqNTID = getReqNTID(F); + if (!ReqNTID.empty()) + O << formatv(".reqntid {0:$[, ]}\n", + make_range(ReqNTID.begin(), ReqNTID.end())); - if (Reqntidx || Reqntidy || Reqntidz) - O << ".reqntid " << Reqntidx.value_or(1) << ", " << Reqntidy.value_or(1) - << ", " << Reqntidz.value_or(1) << "\n"; - - // If the NVVM IR has some of maxntid* specified, then output - // the maxntid directive, and set the unspecified ones to 1. - // If none of maxntid* is specified, don't output maxntid directive. - std::optional<unsigned> Maxntidx = getMaxNTIDx(F); - std::optional<unsigned> Maxntidy = getMaxNTIDy(F); - std::optional<unsigned> Maxntidz = getMaxNTIDz(F); - - if (Maxntidx || Maxntidy || Maxntidz) - O << ".maxntid " << Maxntidx.value_or(1) << ", " << Maxntidy.value_or(1) - << ", " << Maxntidz.value_or(1) << "\n"; + const auto MaxNTID = getMaxNTID(F); + if (!MaxNTID.empty()) + O << formatv(".maxntid {0:$[, ]}\n", + make_range(MaxNTID.begin(), MaxNTID.end())); if (const auto Mincta = getMinCTASm(F)) O << ".minnctapersm " << *Mincta << "\n"; @@ -537,21 +529,19 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl()); if (STI->getSmVersion() >= 90) { - std::optional<unsigned> ClusterX = getClusterDimx(F); - std::optional<unsigned> ClusterY = getClusterDimy(F); - std::optional<unsigned> ClusterZ = getClusterDimz(F); + const auto ClusterDim = getClusterDim(F); - if (ClusterX || ClusterY || ClusterZ) { + if (!ClusterDim.empty()) { O << ".explicitcluster\n"; - if (ClusterX.value_or(1) != 0) { - assert(ClusterY.value_or(1) && ClusterZ.value_or(1) && + if (ClusterDim[0] != 0) { + assert(llvm::all_of(ClusterDim, [](unsigned D) { return D != 0; }) && "cluster_dim_x != 0 implies cluster_dim_y and cluster_dim_z " "should be non-zero as well"); - O << ".reqnctapercluster " << ClusterX.value_or(1) << ", " - << ClusterY.value_or(1) << ", " << ClusterZ.value_or(1) << "\n"; + O << formatv(".reqnctapercluster {0:$[, ]}\n", + make_range(ClusterDim.begin(), ClusterDim.end())); } else { - assert(!ClusterY.value_or(1) && !ClusterZ.value_or(1) && + assert(llvm::all_of(ClusterDim, [](unsigned D) { return D == 0; }) && "cluster_dim_x == 0 implies cluster_dim_y and cluster_dim_z " "should be 0 as well"); } diff --git a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp index ae5922cba4ce3..b10e0b14118a1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp @@ -50,33 +50,10 @@ static std::string getHash(StringRef Str) { return llvm::utohexstr(Hash.low(), /*LowerCase=*/true); } -static void addKernelMetadata(Module &M, Function *F) { - llvm::LLVMContext &Ctx = M.getContext(); - - // Get "nvvm.annotations" metadata node. - llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); - - // This kernel is only to be called single-threaded. - llvm::Metadata *ThreadXMDVals[] = { - llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidx"), - llvm::ConstantAsMetadata::get( - llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; - llvm::Metadata *ThreadYMDVals[] = { - llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidy"), - llvm::ConstantAsMetadata::get( - llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; - llvm::Metadata *ThreadZMDVals[] = { - llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidz"), - llvm::ConstantAsMetadata::get( - llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; - +static void addKernelAttrs(Function *F) { F->addFnAttr("nvvm.maxclusterrank", "1"); + F->addFnAttr("nvvm.maxntid", "1"); F->setCallingConv(CallingConv::PTX_Kernel); - - // Append metadata to nvvm.annotations. - MD->addOperand(llvm::MDNode::get(Ctx, ThreadXMDVals)); - MD->addOperand(llvm::MDNode::get(Ctx, ThreadYMDVals)); - MD->addOperand(llvm::MDNode::get(Ctx, ThreadZMDVals)); } static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) { @@ -88,7 +65,7 @@ static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) { Function *InitOrFiniKernel = Function::createWithDefaultAttr( FunctionType::get(Type::getVoidTy(M.getContext()), false), GlobalValue::WeakODRLinkage, 0, InitOrFiniKernelName, &M); - addKernelMetadata(M, InitOrFiniKernel); + addKernelAttrs(InitOrFiniKernel); return InitOrFiniKernel; } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 85e99d7fe97a2..1d0aa8981d58a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -567,13 +567,14 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, void NVPTXTTIImpl::collectKernelLaunchBounds( const Function &F, SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const { - std::optional<unsigned> Val; - if ((Val = getMaxClusterRank(F))) + if (const auto Val = getMaxClusterRank(F)) LB.push_back({"maxclusterrank", *Val}); - if ((Val = getMaxNTIDx(F))) - LB.push_back({"maxntidx", *Val}); - if ((Val = getMaxNTIDy(F))) - LB.push_back({"maxntidy", *Val}); - if ((Val = getMaxNTIDz(F))) - LB.push_back({"maxntidz", *Val}); + + const auto MaxNTID = getMaxNTID(F); + if (MaxNTID.size() > 0) + LB.push_back({"maxntidx", MaxNTID[0]}); + if (MaxNTID.size() > 1) + LB.push_back({"maxntidy", MaxNTID[1]}); + if (MaxNTID.size() > 2) + LB.push_back({"maxntidz", MaxNTID[2]}); } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index cdb0f559d78b4..0a707bf06095d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -13,6 +13,8 @@ #include "NVPTXUtilities.h" #include "NVPTX.h" #include "NVPTXTargetMachine.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -196,6 +198,36 @@ static std::optional<unsigned> getFnAttrParsedInt(const Function &F, : std::nullopt; } +static SmallVector<unsigned, 3> getFnAttrParsedVector(const Function &F, + StringRef Attr) { + SmallVector<unsigned, 3> V; + auto &Ctx = F.getContext(); + + if (F.hasFnAttribute(Attr)) { + StringRef S = F.getFnAttribute(Attr).getValueAsString(); + for (unsigned I = 0; I < 3 && !S.empty(); I++) { + auto [First, Rest] = S.split(","); + unsigned IntVal; + if (First.trim().getAsInteger(0, IntVal)) + Ctx.emitError("can't parse integer attribute " + First + " in " + Attr); + + V.push_back(IntVal); + S = Rest; + } + } + return V; +} + +static std::optional<unsigned> getVectorProduct(ArrayRef<unsigned> V) { + if (V.empty()) + return std::nullopt; + + unsigned Product = 1; + for (const unsigned E : V) + Product *= E; + return Product; +} + bool isParamGridConstant(const Value &V) { if (const Argument *Arg = dyn_cast<Argument>(&V)) { // "grid_constant" counts argument indices starting from 1 @@ -254,71 +286,39 @@ StringRef getSamplerName(const Value &V) { return V.getName(); } -std::optional<unsigned> getMaxNTIDx(const Function &F) { - return findOneNVVMAnnotation(&F, "maxntidx"); +SmallVector<unsigned, 3> getMaxNTID(const Function &F) { + return getFnAttrParsedVector(F, "nvvm.maxntid"); } -std::optional<unsigned> getMaxNTIDy(const Function &F) { - return findOneNVVMAnnotation(&F, "maxntidy"); +SmallVector<unsigned, 3> getReqNTID(const Function &F) { + return getFnAttrParsedVector(F, "nvvm.reqntid"); } -std::optional<unsigned> getMaxNTIDz(const Function &F) { - return findOneNVVMAnnotation(&F, "maxntidz"); +SmallVector<unsigned, 3> getClusterDim(const Function &F) { + return getFnAttrParsedVector(F, "nvvm.cluster_dim"); } -std::optional<unsigned> getMaxNTID(const Function &F) { +std::optional<unsigned> getOverallMaxNTID(const Function &F) { // Note: The semantics here are a bit strange. The PTX ISA states the // following (11.4.2. Performance-Tuning Directives: .maxntid): // // Note that this directive guarantees that the total number of threads does // not exceed the maximum, but does not guarantee that the limit in any // particular dimension is not exceeded. - std::optional<unsigned> MaxNTIDx = getMaxNTIDx(F); - std::optional<unsigned> MaxNTIDy = getMaxNTIDy(F); - std::optional<unsigned> MaxNTIDz = getMaxNTIDz(F); - if (MaxNTIDx || MaxNTIDy || MaxNTIDz) - return MaxNTIDx.value_or(1) * MaxNTIDy.value_or(1) * MaxNTIDz.value_or(1); - return std::nullopt; -} - -std::optional<unsigned> getClusterDimx(const Function &F) { - return findOneNVVMAnnotation(&F, "cluster_dim_x"); + const auto MaxNTID = getMaxNTID(F); + return getVectorProduct(MaxNTID); } -std::optional<unsigned> getClusterDimy(const Function &F) { - return findOneNVVMAnnotation(&F, "cluster_dim_y"); -} - -std::optional<unsigned> getClusterDimz(const Function &F) { - return findOneNVVMAnnotation(&F, "cluster_dim_z"); +std::optional<unsigned> getOverallReqNTID(const Function &F) { + // Note: The semantics here are a bit strange. See getMaxNTID. + const auto ReqNTID = getReqNTID(F); + return getVectorProduct(ReqNTID); } std::optional<unsigned> getMaxClusterRank(const Function &F) { return getFnAttrParsedInt(F, "nvvm.maxclusterrank"); } -std::optional<unsigned> getReqNTIDx(const Function &F) { - return findOneNVVMAnnotation(&F, "reqntidx"); -} - -std::optional<unsigned> getReqNTIDy(const Function &F) { - return findOneNVVMAnnotation(&F, "reqntidy"); -} - -std::optional<unsigned> getReqNTIDz(const Function &F) { - return findOneNVVMAnnotation(&F, "reqntidz"); -} - -std::optional<unsigned> getReqNTID(const Function &F) { - // Note: The semantics here are a bit strange. See getMaxNTID. - std::optional<unsigned> ReqNTIDx = getReqNTIDx(F); - std::optional<unsigned> ReqNTIDy = getReqNTIDy(F); - std::optional<unsigned> ReqNTIDz = getReqNTIDz(F); - if (ReqNTIDx || ReqNTIDy || ReqNTIDz) - return ReqNTIDx.value_or(1) * ReqNTIDy.value_or(1) * ReqNTIDz.value_or(1); - return std::nullopt; -} - std::optional<unsigned> getMinCTASm(const Function &F) { return getFnAttrParsedInt(F, "nvvm.minctasm"); } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index cf35eaf4cbae5..cecd003bf23a4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -14,6 +14,7 @@ #define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H #include "NVPTX.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/CallingConv.h" @@ -47,19 +48,12 @@ StringRef getTextureName(const Value &); StringRef getSurfaceName(const Value &); StringRef getSamplerName(const Value &); -std::optional<unsigned> getMaxNTIDx(const Function &); -std::optional<unsigned> getMaxNTIDy(const Function &); -std::optional<unsigned> getMaxNTIDz(const Function &); -std::optional<unsigned> getMaxNTID(const Function &); +SmallVector<unsigned, 3> getMaxNTID(const Function &); +SmallVector<unsigned, 3> getReqNTID(const Function &); +SmallVector<unsigned, 3> getClusterDim(const Function &); -std::optional<unsigned> getReqNTIDx(const Function &); -std::optional<unsigned> getReqNTIDy(const Function &); -std::optional<unsigned> getReqNTIDz(const Function &); -std::optional<unsigned> getReqNTID(const Function &); - -std::optional<unsigned> getClusterDimx(const Function &); -std::optional<unsigned> getClusterDimy(const Function &); -std::optional<unsigned> getClusterDimz(const Function &); +std::optional<unsigned> getOverallMaxNTID(const Function &); +std::optional<unsigned> getOverallReqNTID(const Function &); std::optional<unsigned> getMaxClusterRank(const Function &); std::optional<unsigned> getMinCTASm(const Function &); diff --git a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp index 8dd46f9a1402d..8286e9661f202 100644 --- a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp +++ b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp @@ -67,8 +67,8 @@ static bool runNVVMIntrRange(Function &F) { unsigned x, y, z; } MaxBlockSize, MaxGridSize; - const unsigned MetadataNTID = getReqNTID(F).value_or( - getMaxNTID(F).value_or(std::numeric_limits<unsigned>::max())); + const unsigned MetadataNTID = getOverallReqNTID(F).value_or( + getOverallMaxNTID(F).value_or(std::numeric_limits<unsigned>::max())); MaxBlockSize.x = std::min(1024u, MetadataNTID); MaxBlockSize.y = std::min(1024u, MetadataNTID); diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll index a0c06083c270b..f5cabb4e6488d 100644 --- a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll @@ -24,11 +24,11 @@ attributes #0 = { "omp_target_num_teams"="100" "omp_target_thread_limit"="101" "nvvm.maxclusterrank"="200" + "nvvm.maxntid"="210,211,212" } !llvm.module.flags = !{!0} !llvm.dbg.cu = !{!1} -!nvvm.annotations = !{!7, !8, !9, !10} !0 = !{i32 2, !"Debug Info Version", i32 3} !1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) @@ -36,7 +36,3 @@ attributes #0 = { !3 = !{} !4 = !DISubroutineType(types: !3) !5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) -!7 = !{ptr @test, !"maxntidx", i32 210} -!8 = !{ptr @test, !"maxntidy", i32 211} -!9 = !{ptr @test, !"maxntidz", i32 212} -!10 = distinct !{ptr null, !"kernel", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/annotations.ll b/llvm/test/CodeGen/NVPTX/annotations.ll index 1f888d7fb21f1..5360e8988777b 100644 --- a/llvm/test/CodeGen/NVPTX/annotations.ll +++ b/llvm/test/CodeGen/NVPTX/annotations.ll @@ -9,14 +9,14 @@ ; CHECK: .global .surfref surface ; CHECK: .entry kernel_func_maxntid -define void @kernel_func_maxntid(ptr %a) { +define ptx_kernel void @kernel_func_maxntid(ptr %a) "nvvm.maxntid"="10,20,30" { ; CHECK: .maxntid 10, 20, 30 ; CHECK: ret ret void } ; CHECK: .entry kernel_func_reqntid -define void @kernel_func_reqntid(ptr %a) { +define ptx_kernel void @kernel_func_reqntid(ptr %a) "nvvm.reqntid"="11,22,33" { ; CHECK: .reqntid 11, 22, 33 ; CHECK: ret ret void @@ -36,13 +36,7 @@ define ptx_kernel void @kernel_func_maxnreg() "nvvm.maxnreg"="1234" { ret void } -!nvvm.annotations = !{!1, !2, !3, !4, !9, !10} - -!1 = !{ptr @kernel_func_maxntid, !"kernel", i32 1} -!2 = !{ptr @kernel_func_maxntid, !"maxntidx", i32 10, !"maxntidy", i32 20, !"maxntidz", i32 30} - -!3 = !{ptr @kernel_func_reqntid, !"kernel", i32 1} -!4 = !{ptr @kernel_func_reqntid, !"reqntidx", i32 11, !"reqntidy", i32 22, !"reqntidz", i32 33} +!nvvm.annotations = !{!9, !10} !9 = !{ptr addrspace(1) @texture, !"texture", i32 1} !10 = !{ptr addrspace(1) @surface, !"surface", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll index 89cafcede06bd..c0bbf5b3559bb 100644 --- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll +++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll @@ -26,8 +26,4 @@ bb: ret void } -attributes #0 = { norecurse nounwind "polly.skip.fn" } - -!nvvm.annotations = !{!0} - -!0 = !{ptr @spam, !"maxntidx", i64 1, !"maxntidy", i64 1, !"maxntidz", i64 1} +attributes #0 = { norecurse nounwind "polly.skip.fn" "nvvm.maxntid"="1,1,1" } diff --git a/llvm/test/CodeGen/NVPTX/cluster-dim.ll b/llvm/test/CodeGen/NVPTX/cluster-dim.ll index 9275c895b224a..196b967ce8685 100644 --- a/llvm/test/CodeGen/NVPTX/cluster-dim.ll +++ b/llvm/test/CodeGen/NVPTX/cluster-dim.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 | FileCheck -check-prefixes=CHECK90 %s ; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %} -define ptx_kernel void @kernel_func_clusterxyz() { +define ptx_kernel void @kernel_func_clusterxyz() "nvvm.cluster_dim"="3,5,7" { ; CHECK80-LABEL: kernel_func_clusterxyz( ; CHECK80: { ; CHECK80-EMPTY: @@ -21,8 +21,3 @@ define ptx_kernel void @kernel_func_clusterxyz() { ; CHECK90-NEXT: ret; ret void } - - -!nvvm.annotations = !{!1} - -!1 = !{ptr @kernel_func_clusterxyz, !"cluster_dim_x", i32 3, !"cluster_dim_y", i32 5, !"cluster_dim_z", i32 7} diff --git a/llvm/test/CodeGen/NVPTX/intr-range.ll b/llvm/test/CodeGen/NVPTX/intr-range.ll index 86776ab09efc6..884a4b1a3584f 100644 --- a/llvm/test/CodeGen/NVPTX/intr-range.ll +++ b/llvm/test/CodeGen/NVPTX/intr-range.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5 ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s -define ptx_kernel i32 @test_maxntid() { +define ptx_kernel i32 @test_maxntid() "nvvm.maxntid"="32,1,3" { ; CHECK-LABEL: define ptx_kernel i32 @test_maxntid( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.x() @@ -31,9 +31,9 @@ define ptx_kernel i32 @test_maxntid() { ret i32 %11 } -define ptx_kernel i32 @test_reqntid() { +define ptx_kernel i32 @test_reqntid() "nvvm.reqntid"="20" { ; CHECK-LABEL: define ptx_kernel i32 @test_reqntid( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.y() ; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.z() @@ -64,9 +64,9 @@ define ptx_kernel i32 @test_reqntid() { ;; A case like this could occur if a function with the sreg intrinsic was ;; inlined into a kernel where the tid metadata is present, ensure the range is ;; updated. -define ptx_kernel i32 @test_inlined() { +define ptx_kernel i32 @test_inlined() "nvvm.maxntid"="4" { ; CHECK-LABEL: define ptx_kernel i32 @test_inlined( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 4) i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; CHECK-NEXT: ret i32 [[TMP1]] ; @@ -81,8 +81,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z() - -!nvvm.annotations = !{!0, !1, !2} -!0 = !{ptr @test_maxntid, !"maxntidx", i32 32, !"maxntidz", i32 3} -!1 = !{ptr @test_reqntid, !"reqntidx", i32 20} -!2 = !{ptr @test_inlined, !"maxntidx", i32 4} diff --git a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll index 3b73c36de4b89..02118fbf741bf 100644 --- a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll +++ b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll @@ -84,11 +84,4 @@ define internal void @bar() { ; CHECK: while.end: ; CHECK-NEXT: ret void -; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="1" } - -; CHECK: [[META1:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidx", i32 1} -; CHECK: [[META2:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidy", i32 1} -; CHECK: [[META3:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidz", i32 1} -; CHECK: [[META4:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidx", i32 1} -; CHECK: [[META5:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidy", i32 1} -; CHECK: [[META6:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidz", i32 1} +; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="1" "nvvm.maxntid"="1" } diff --git a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll index 51483296dd34f..ce8181e9a70e2 100644 --- a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll +++ b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll @@ -4,20 +4,16 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" target triple = "nvptx64-unknown-unknown" -; CHECK: .maxntid 128, 1, 1 +; CHECK: .maxntid 128 ; CHECK: .minnctapersm 2 ; CHECK_SM_90: .maxclusterrank 8 ; CHECK_SM_80-NOT: .maxclusterrank 8 ; Make sure that for SM version prior to 90 `.maxclusterrank` directive is ; silently ignored. -define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() "nvvm.minctasm"="2" "nvvm.maxclusterrank"="8" { +define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() "nvvm.minctasm"="2" "nvvm.maxclusterrank"="8" "nvvm.maxntid"="128" { entry: %a = alloca i32, align 4 store volatile i32 1, ptr %a, align 4 ret void } - -!nvvm.annotations = !{!1} - -!1 = !{ptr @_Z18TestMaxClusterRankv, !"maxntidx", i32 128} diff --git a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll index 3a1f59454493c..7c5d3d37d5508 100644 --- a/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll +++ b/llvm/test/CodeGen/NVPTX/upgrade-nvvm-annotations.ll @@ -48,7 +48,55 @@ define void @test_maxnreg() { ret void } -!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6} +define void @test_maxntid_1() { +; CHECK-LABEL: define void @test_maxntid_1( +; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_maxntid_2() { +; CHECK-LABEL: define void @test_maxntid_2( +; CHECK-SAME: ) #[[ATTR5:[0-9]+]] { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_maxntid_3() { +; CHECK-LABEL: define void @test_maxntid_3( +; CHECK-SAME: ) #[[ATTR6:[0-9]+]] { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_maxntid_4() { +; CHECK-LABEL: define void @test_maxntid_4( +; CHECK-SAME: ) #[[ATTR7:[0-9]+]] { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_reqntid() { +; CHECK-LABEL: define void @test_reqntid( +; CHECK-SAME: ) #[[ATTR8:[0-9]+]] { +; CHECK-NEXT: ret void +; + ret void +} + +define void @test_cluster_dim() { +; CHECK-LABEL: define void @test_cluster_dim( +; CHECK-SAME: ) #[[ATTR9:[0-9]+]] { +; CHECK-NEXT: ret void +; + ret void +} + +!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12} !0 = !{ptr @test_align, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020010} !1 = !{null, !"align", i32 u0x00000008, !"align", i32 u0x00010008, !"align", i32 u0x00020008} @@ -57,12 +105,24 @@ define void @test_maxnreg() { !4 = !{ptr @test_cluster_max_blocks, !"cluster_max_blocks", i32 3} !5 = !{ptr @test_minctasm, !"minctasm", i32 4} !6 = !{ptr @test_maxnreg, !"maxnreg", i32 5} +!7 = !{ptr @test_maxntid_1, !"maxntidx", i32 50} +!8 = !{ptr @test_maxntid_2, !"maxntidx", i32 11, !"maxntidy", i32 22, !"maxntidz", i32 33} +!9 = !{ptr @test_maxntid_3, !"maxntidz", i32 11, !"maxntidy", i32 22, !"maxntidx", i32 33} +!10 = !{ptr @test_maxntid_4, !"maxntidz", i32 100} +!11 = !{ptr @test_reqntid, !"reqntidx", i32 31, !"reqntidy", i32 32, !"reqntidz", i32 33} +!12 = !{ptr @test_cluster_dim, !"cluster_dim_x", i32 101, !"cluster_dim_y", i32 102, !"cluster_dim_z", i32 103} ;. ; CHECK: attributes #[[ATTR0]] = { "nvvm.maxclusterrank"="2" } ; CHECK: attributes #[[ATTR1]] = { "nvvm.maxclusterrank"="3" } ; CHECK: attributes #[[ATTR2]] = { "nvvm.minctasm"="4" } ; CHECK: attributes #[[ATTR3]] = { "nvvm.maxnreg"="5" } +; CHECK: attributes #[[ATTR4]] = { "nvvm.maxntid"="50" } +; CHECK: attributes #[[ATTR5]] = { "nvvm.maxntid"="11,22,33" } +; CHECK: attributes #[[ATTR6]] = { "nvvm.maxntid"="33,22,11" } +; CHECK: attributes #[[ATTR7]] = { "nvvm.maxntid"="1,1,100" } +; CHECK: attributes #[[ATTR8]] = { "nvvm.reqntid"="31,32,33" } +; CHECK: attributes #[[ATTR9]] = { "nvvm.cluster_dim"="101,102,103" } ;. ; CHECK: [[META0:![0-9]+]] = !{ptr @test_align, !"align", i32 8} ;. diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp index 8b13735774663..858beed959933 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp @@ -18,8 +18,10 @@ #include "mlir/Target/LLVMIR/ModuleTranslation.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/Support/FormatVariadic.h" using namespace mlir; using namespace mlir::LLVM; @@ -183,48 +185,33 @@ class NVVMDialectLLVMIRTranslationInterface auto func = dyn_cast<LLVM::LLVMFuncOp>(op); if (!func) return failure(); - llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext(); llvm::Function *llvmFunc = moduleTranslation.lookupFunction(func.getName()); - auto generateMetadata = [&](int dim, StringRef name) { - llvm::Metadata *llvmMetadata[] = { - llvm::ValueAsMetadata::get(llvmFunc), - llvm::MDString::get(llvmContext, name), - llvm::ValueAsMetadata::get(llvm::ConstantInt::get( - llvm::Type::getInt32Ty(llvmContext), dim))}; - llvm::MDNode *llvmMetadataNode = - llvm::MDNode::get(llvmContext, llvmMetadata); - moduleTranslation.getOrInsertNamedModuleMetadata("nvvm.annotations") - ->addOperand(llvmMetadataNode); - }; if (attribute.getName() == NVVM::NVVMDialect::getMaxntidAttrName()) { if (!dyn_cast<DenseI32ArrayAttr>(attribute.getValue())) return failure(); auto values = cast<DenseI32ArrayAttr>(attribute.getValue()); - generateMetadata(values[0], NVVM::NVVMDialect::getMaxntidXName()); - if (values.size() > 1) - generateMetadata(values[1], NVVM::NVVMDialect::getMaxntidYName()); - if (values.size() > 2) - generateMetadata(values[2], NVVM::NVVMDialect::getMaxntidZName()); + const std::string attr = llvm::formatv( + "{0:$[,]}", llvm::make_range(values.asArrayRef().begin(), + values.asArrayRef().end())); + llvmFunc->addFnAttr("nvvm.maxntid", attr); } else if (attribute.getName() == NVVM::NVVMDialect::getReqntidAttrName()) { if (!dyn_cast<DenseI32ArrayAttr>(attribute.getValue())) return failure(); auto values = cast<DenseI32ArrayAttr>(attribute.getValue()); - generateMetadata(values[0], NVVM::NVVMDialect::getReqntidXName()); - if (values.size() > 1) - generateMetadata(values[1], NVVM::NVVMDialect::getReqntidYName()); - if (values.size() > 2) - generateMetadata(values[2], NVVM::NVVMDialect::getReqntidZName()); + const std::string attr = llvm::formatv( + "{0:$[,]}", llvm::make_range(values.asArrayRef().begin(), + values.asArrayRef().end())); + llvmFunc->addFnAttr("nvvm.reqntid", attr); } else if (attribute.getName() == NVVM::NVVMDialect::getClusterDimAttrName()) { if (!dyn_cast<DenseI32ArrayAttr>(attribute.getValue())) return failure(); auto values = cast<DenseI32ArrayAttr>(attribute.getValue()); - generateMetadata(values[0], NVVM::NVVMDialect::getClusterDimXName()); - if (values.size() > 1) - generateMetadata(values[1], NVVM::NVVMDialect::getClusterDimYName()); - if (values.size() > 2) - generateMetadata(values[2], NVVM::NVVMDialect::getClusterDimZName()); + const std::string attr = llvm::formatv( + "{0:$[,]}", llvm::make_range(values.asArrayRef().begin(), + values.asArrayRef().end())); + llvmFunc->addFnAttr("nvvm.cluster_dim", attr); } else if (attribute.getName() == NVVM::NVVMDialect::getClusterMaxBlocksAttrName()) { auto value = dyn_cast<IntegerAttr>(attribute.getValue()); diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 5ab593452ab66..a9717d26d7854 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -590,33 +590,24 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2 llvm.return } -// CHECK: define ptx_kernel void @kernel_func -// CHECK: !nvvm.annotations = -// CHECK: {ptr @kernel_func, !"maxntidx", i32 1} -// CHECK: {ptr @kernel_func, !"maxntidy", i32 23} -// CHECK: {ptr @kernel_func, !"maxntidz", i32 32} +// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]] +// CHECK: attributes #[[ATTR0]] = { "nvvm.maxntid"="1,23,32" } // ----- llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.reqntid = array<i32: 1, 23, 32>} { llvm.return } -// CHECK: define ptx_kernel void @kernel_func -// CHECK: !nvvm.annotations = -// CHECK: {ptr @kernel_func, !"reqntidx", i32 1} -// CHECK: {ptr @kernel_func, !"reqntidy", i32 23} -// CHECK: {ptr @kernel_func, !"reqntidz", i32 32} +// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]] +// CHECK: attributes #[[ATTR0]] = { "nvvm.reqntid"="1,23,32" } // ----- llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_dim = array<i32: 3, 5, 7>} { llvm.return } -// CHECK: define ptx_kernel void @kernel_func -// CHECK: !nvvm.annotations = -// CHECK: {ptr @kernel_func, !"cluster_dim_x", i32 3} -// CHECK: {ptr @kernel_func, !"cluster_dim_y", i32 5} -// CHECK: {ptr @kernel_func, !"cluster_dim_z", i32 7} +// CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]] +// CHECK: attributes #[[ATTR0]] = { "nvvm.cluster_dim"="3,5,7" } // ----- llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_max_blocks = 8} { @@ -650,11 +641,7 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2 } // CHECK: define ptx_kernel void @kernel_func() #[[ATTR0:[0-9]+]] -// CHECK: attributes #[[ATTR0]] = { "nvvm.maxnreg"="32" "nvvm.minctasm"="16" } -// CHECK: !nvvm.annotations = -// CHECK: {ptr @kernel_func, !"maxntidx", i32 1} -// CHECK: {ptr @kernel_func, !"maxntidy", i32 23} -// CHECK: {ptr @kernel_func, !"maxntidz", i32 32} +// CHECK: attributes #[[ATTR0]] = { "nvvm.maxnreg"="32" "nvvm.maxntid"="1,23,32" "nvvm.minctasm"="16" } // ----- // CHECK: define ptx_kernel void @kernel_func >From 0a227b1a9eddd20b0a679f980faf61866c05fbe2 Mon Sep 17 00:00:00 2001 From: Alex Maclean <amacl...@nvidia.com> Date: Wed, 19 Feb 2025 20:52:54 +0000 Subject: [PATCH 2/2] address comments, fixup test --- clang/test/OpenMP/ompx_attributes_codegen.cpp | 18 +++++++++--------- llvm/lib/IR/AutoUpgrade.cpp | 17 +++++++++++------ llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 14 +++++++------- llvm/lib/Target/NVPTX/NVPTXUtilities.h | 4 ++-- 4 files changed, 29 insertions(+), 24 deletions(-) diff --git a/clang/test/OpenMP/ompx_attributes_codegen.cpp b/clang/test/OpenMP/ompx_attributes_codegen.cpp index 7cdbe8b9d788a..d68f00a81335c 100644 --- a/clang/test/OpenMP/ompx_attributes_codegen.cpp +++ b/clang/test/OpenMP/ompx_attributes_codegen.cpp @@ -11,13 +11,13 @@ // Check that the target attributes are set on the generated kernel void func() { - // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #0 - // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}}) - // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #4 + // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #0 + // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}}) + // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #4 - // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #[[ATTR0:[0-9]+]] - // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}}) #[[ATTR1:[0-9]+]] - // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #[[ATTR2:[0-9]+]] + // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #[[ATTR0:[0-9]+]] + // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}}) #[[ATTR1:[0-9]+]] + // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #[[ATTR2:[0-9]+]] #pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]]) {} @@ -39,11 +39,11 @@ void func() { // It is unclear if we should use the AMD annotations for other targets, we do for now. // NVIDIA: attributes #[[ATTR0]] -// NVIDIA-SAME: "omp_target_thread_limit"="20" // NVIDIA-SAME: "nvvm.maxntid"="20" +// NVIDIA-SAME: "omp_target_thread_limit"="20" // NVIDIA: attributes #[[ATTR1]] -// NVIDIA-SAME: "omp_target_thread_limit"="45" // NVIDIA-SAME: "nvvm.maxntid"="45" +// NVIDIA-SAME: "omp_target_thread_limit"="45" // NVIDIA: attributes #[[ATTR2]] -// NVIDIA-SAME: "omp_target_thread_limit"="17" // NVIDIA-SAME: "nvvm.maxntid"="17" +// NVIDIA-SAME: "omp_target_thread_limit"="17" diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index dc18ba9780fec..1daf15b481cf8 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -5033,6 +5033,8 @@ static void upgradeNVVMFnVectorAttr(const StringRef Attr, const char DimC, unsigned Length = 0; if (F->hasFnAttribute(Attr)) { + // We expect the existing attribute to have the form "x[,y[,z]]". Here we + // parse these elements placing them into Vect3 StringRef S = F->getFnAttribute(Attr).getValueAsString(); for (; Length < 3 && !S.empty(); Length++) { auto [Part, Rest] = S.split(','); @@ -5041,12 +5043,11 @@ static void upgradeNVVMFnVectorAttr(const StringRef Attr, const char DimC, } } - const uint64_t VInt = mdconst::extract<ConstantInt>(V)->getZExtValue(); - const std::string VStr = llvm::utostr(VInt); - const unsigned Dim = DimC - 'x'; assert(Dim >= 0 && Dim < 3 && "Unexpected dim char"); + const uint64_t VInt = mdconst::extract<ConstantInt>(V)->getZExtValue(); + const std::string VStr = llvm::utostr(VInt); Vect3[Dim] = VStr; Length = std::max(Length, Dim + 1); @@ -5054,6 +5055,10 @@ static void upgradeNVVMFnVectorAttr(const StringRef Attr, const char DimC, F->addFnAttr(Attr, NewAttr); } +static inline bool isXYZ(StringRef S) { + return S == "x" || S == "y" || S == "z"; +} + bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K, const Metadata *V) { if (K == "kernel") { @@ -5092,15 +5097,15 @@ bool static upgradeSingleNVVMAnnotation(GlobalValue *GV, StringRef K, cast<Function>(GV)->addFnAttr("nvvm.maxnreg", llvm::utostr(CV)); return true; } - if (K.consume_front("maxntid") && (K == "x" || K == "y" || K == "z")) { + if (K.consume_front("maxntid") && isXYZ(K)) { upgradeNVVMFnVectorAttr("nvvm.maxntid", K[0], GV, V); return true; } - if (K.consume_front("reqntid") && (K == "x" || K == "y" || K == "z")) { + if (K.consume_front("reqntid") && isXYZ(K)) { upgradeNVVMFnVectorAttr("nvvm.reqntid", K[0], GV, V); return true; } - if (K.consume_front("cluster_dim_") && (K == "x" || K == "y" || K == "z")) { + if (K.consume_front("cluster_dim_") && isXYZ(K)) { upgradeNVVMFnVectorAttr("nvvm.cluster_dim", K[0], GV, V); return true; } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 0a707bf06095d..d44a38cbde72a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Mutex.h" +#include <cstdint> #include <cstring> #include <map> #include <mutex> @@ -204,6 +205,8 @@ static SmallVector<unsigned, 3> getFnAttrParsedVector(const Function &F, auto &Ctx = F.getContext(); if (F.hasFnAttribute(Attr)) { + // We expect the attribute value to be of the form "x[,y[,z]]", where x, y, + // and z are unsigned values. StringRef S = F.getFnAttribute(Attr).getValueAsString(); for (unsigned I = 0; I < 3 && !S.empty(); I++) { auto [First, Rest] = S.split(","); @@ -218,14 +221,11 @@ static SmallVector<unsigned, 3> getFnAttrParsedVector(const Function &F, return V; } -static std::optional<unsigned> getVectorProduct(ArrayRef<unsigned> V) { +static std::optional<uint64_t> getVectorProduct(ArrayRef<unsigned> V) { if (V.empty()) return std::nullopt; - unsigned Product = 1; - for (const unsigned E : V) - Product *= E; - return Product; + return std::accumulate(V.begin(), V.end(), 1, std::multiplies<uint64_t>{}); } bool isParamGridConstant(const Value &V) { @@ -298,7 +298,7 @@ SmallVector<unsigned, 3> getClusterDim(const Function &F) { return getFnAttrParsedVector(F, "nvvm.cluster_dim"); } -std::optional<unsigned> getOverallMaxNTID(const Function &F) { +std::optional<uint64_t> getOverallMaxNTID(const Function &F) { // Note: The semantics here are a bit strange. The PTX ISA states the // following (11.4.2. Performance-Tuning Directives: .maxntid): // @@ -309,7 +309,7 @@ std::optional<unsigned> getOverallMaxNTID(const Function &F) { return getVectorProduct(MaxNTID); } -std::optional<unsigned> getOverallReqNTID(const Function &F) { +std::optional<uint64_t> getOverallReqNTID(const Function &F) { // Note: The semantics here are a bit strange. See getMaxNTID. const auto ReqNTID = getReqNTID(F); return getVectorProduct(ReqNTID); diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index cecd003bf23a4..4ed379765fc20 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -52,8 +52,8 @@ SmallVector<unsigned, 3> getMaxNTID(const Function &); SmallVector<unsigned, 3> getReqNTID(const Function &); SmallVector<unsigned, 3> getClusterDim(const Function &); -std::optional<unsigned> getOverallMaxNTID(const Function &); -std::optional<unsigned> getOverallReqNTID(const Function &); +std::optional<uint64_t> getOverallMaxNTID(const Function &); +std::optional<uint64_t> getOverallReqNTID(const Function &); std::optional<unsigned> getMaxClusterRank(const Function &); std::optional<unsigned> getMinCTASm(const Function &); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits