https://github.com/lucas-rami updated https://github.com/llvm/llvm-project/pull/138284
>From 9906334cbabf160221dea11aa93e2cf7a154e6da Mon Sep 17 00:00:00 2001 From: Lucas Ramirez <lucas.r...@proton.me> Date: Thu, 22 May 2025 13:26:57 +0000 Subject: [PATCH 1/2] Rebase on main (integrate changes from 1b34722) --- clang/lib/CodeGen/Targets/AMDGPU.cpp | 27 ++++----- clang/lib/Sema/SemaAMDGPU.cpp | 5 -- clang/test/SemaOpenCL/amdgpu-attrs.cl | 1 - llvm/lib/IR/Verifier.cpp | 23 ++++++++ llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 4 ++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 46 ++++++++------- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 4 +- .../AMDGPU/attr-amdgpu-waves-per-eu.ll | 12 ++++ ...-work-group-size-overrides-waves-per-eu.ll | 4 +- ...ine-scheduler-sink-trivial-remats-attr.mir | 12 ++-- .../CodeGen/AMDGPU/propagate-waves-per-eu.ll | 56 ++++++++----------- .../Verifier/AMDGPU/amdgpu-waves-per-eu.ll | 40 +++++++++++++ 12 files changed, 153 insertions(+), 81 deletions(-) create mode 100644 llvm/test/Verifier/AMDGPU/amdgpu-waves-per-eu.ll diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index c8921c434db47..8072d4f363feb 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -743,20 +743,21 @@ void CodeGenModule::handleAMDGPUWavesPerEUAttr( llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) { unsigned Min = Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue(); - unsigned Max = - Attr->getMax() - ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue() - : 0; - if (Min != 0) { - assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max"); - - std::string AttrVal = llvm::utostr(Min); - if (Max != 0) - AttrVal = AttrVal + "," + llvm::utostr(Max); - F->addFnAttr("amdgpu-waves-per-eu", AttrVal); - } else - assert(Max == 0 && "Max must be zero"); + if (Attr->getMax()) { + unsigned Max = + Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue(); + assert(Min == 0 || (Min != 0 && Max != 0) && + "Min must be non-zero when Max is non-zero"); + assert(Min <= Max && "Min must be less than or equal to Max"); + // Do not add the attribute if min,max=0,0. + if (Min != 0) { + std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max); + F->addFnAttr("amdgpu-waves-per-eu", AttrVal); + } + } else if (Min != 0) { + F->addFnAttr("amdgpu-waves-per-eu", llvm::utostr(Min)); + } } std::unique_ptr<TargetCodeGenInfo> diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index e6414a623b929..9ae3ec1289def 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -244,11 +244,6 @@ static bool checkAMDGPUWavesPerEUArguments(Sema &S, Expr *MinExpr, if (MaxExpr && !S.checkUInt32Argument(Attr, MaxExpr, Max, 1)) return true; - if (Min == 0 && Max != 0) { - S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid) - << &Attr << 0; - return true; - } if (Max != 0 && Min > Max) { S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid) << &Attr << 1; diff --git a/clang/test/SemaOpenCL/amdgpu-attrs.cl b/clang/test/SemaOpenCL/amdgpu-attrs.cl index 89ba3f86803c5..b9b44dff4d4a9 100644 --- a/clang/test/SemaOpenCL/amdgpu-attrs.cl +++ b/clang/test/SemaOpenCL/amdgpu-attrs.cl @@ -46,7 +46,6 @@ __attribute__((amdgpu_num_sgpr(4294967296))) kernel void kernel_num_sgpr_L() {} __attribute__((amdgpu_num_vgpr(4294967296))) kernel void kernel_num_vgpr_L() {} // expected-error {{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}} __attribute__((amdgpu_flat_work_group_size(0, 64))) kernel void kernel_flat_work_group_size_0_64() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: max must be 0 since min is 0}} -__attribute__((amdgpu_waves_per_eu(0, 4))) kernel void kernel_waves_per_eu_0_4() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: max must be 0 since min is 0}} __attribute__((amdgpu_flat_work_group_size(64, 32))) kernel void kernel_flat_work_group_size_64_32() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: min must not be greater than max}} __attribute__((amdgpu_waves_per_eu(4, 2))) kernel void kernel_waves_per_eu_4_2() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: min must not be greater than max}} diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 73b4274a41ee6..402af44e38cbf 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2473,6 +2473,29 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, CheckFailed("invalid value for 'denormal-fp-math-f32' attribute: " + S, V); } + + if (TT.isAMDGPU()) { + if (auto A = Attrs.getFnAttr("amdgpu-waves-per-eu"); A.isValid()) { + std::pair<StringRef, StringRef> Strs = A.getValueAsString().split(','); + unsigned Min = 0; + StringRef MinStr = Strs.first.trim(); + Check(!MinStr.getAsInteger(0, Min), + "minimum for 'amdgpu-waves-per-eu' must be integer: " + MinStr); + if (!Strs.second.empty()) { + unsigned Max = 0; + StringRef MaxStr = Strs.second.trim(); + Check(!MaxStr.getAsInteger(0, Max), + "maximum for 'amdgpu-waves-per-eu' must be integer: " + MaxStr); + Check(Max, "maximum for 'amdgpu-waves-per-eu' must be non-zero"); + Check(Min <= Max, "minimum must be less than or equal to maximum for " + "'amdgpu-waves-per-eu': " + + MinStr + " > " + MaxStr); + } else { + Check(Min, "minimum for 'amdgpu-waves-per-eu' must be non-zero when " + "maximum is not provided"); + } + } + } } void Verifier::verifyFunctionMetadata( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index ee887447972bf..52540bac85a52 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1367,6 +1367,10 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) { return Changed; } +// 14 ==> 15 +// 15 ==> 16 +// 16 ==> 17 + static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, AMDGPUAttributorOptions Options, ThinOrFullLTOPhase LTOPhase) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index d095fc6cf9549..1fcd0654fba31 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -156,15 +156,15 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const { } } -std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( - const Function &F) const { +std::pair<unsigned, unsigned> +AMDGPUSubtarget::getFlatWorkGroupSizes(const Function &F) const { // Default minimum/maximum flat work group sizes. std::pair<unsigned, unsigned> Default = - getDefaultFlatWorkGroupSize(F.getCallingConv()); + getDefaultFlatWorkGroupSize(F.getCallingConv()); // Requested minimum/maximum flat work group sizes. std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( - F, "amdgpu-flat-work-group-size", Default); + F, "amdgpu-flat-work-group-size", Default); // Make sure requested minimum is less than requested maximum. if (Requested.first > Requested.second) @@ -186,23 +186,29 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU( // sizes limits the achievable maximum, and we aim to support enough waves per // EU so that we can concurrently execute all waves of a single workgroup of // maximum size on a CU. - std::pair<unsigned, unsigned> Default = { + std::pair<unsigned, unsigned> WavesPerEU = { getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second), getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second}; - Default.first = std::min(Default.first, Default.second); - - // Make sure requested minimum is within the default range and lower than the - // requested maximum. The latter must not violate target specification. - if (RequestedWavesPerEU.first < Default.first || - RequestedWavesPerEU.first > Default.second || - RequestedWavesPerEU.first > RequestedWavesPerEU.second || - RequestedWavesPerEU.second > getMaxWavesPerEU()) - return Default; - - // We cannot exceed maximum occupancy implied by flat workgroup size and LDS. - RequestedWavesPerEU.second = - std::min(RequestedWavesPerEU.second, Default.second); - return RequestedWavesPerEU; + WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second); + + // Requested minimum must not violate subtarget's specifications and be no + // greater than maximum. + if (RequestedWavesPerEU.first && + (RequestedWavesPerEU.first < getMinWavesPerEU() || + RequestedWavesPerEU.first > RequestedWavesPerEU.second)) + return WavesPerEU; + // Requested maximum must not violate subtarget's specifications. + if (RequestedWavesPerEU.second > getMaxWavesPerEU()) + return WavesPerEU; + + // A requested maximum may limit both the final minimum and maximum, but + // not increase them. A requested minimum can either decrease or increase the + // default minimum as long as it doesn't exceed the maximum. + WavesPerEU.second = std::min(WavesPerEU.second, RequestedWavesPerEU.second); + if (RequestedWavesPerEU.first) + WavesPerEU.first = RequestedWavesPerEU.first; + WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second); + return WavesPerEU; } std::pair<unsigned, unsigned> @@ -229,7 +235,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes, unsigned LDSBytes, const Function &F) const { // Default minimum/maximum number of waves per execution unit. - std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU()); + std::pair<unsigned, unsigned> Default(0, getMaxWavesPerEU()); // Requested minimum/maximum number of waves per execution unit. std::pair<unsigned, unsigned> Requested = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 23826cc22250a..dafadaff97930 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -126,7 +126,9 @@ class AMDGPUSubtarget { /// Returns the target minimum/maximum number of waves per EU. This is based /// on the minimum/maximum number of \p RequestedWavesPerEU and further /// limited by the maximum achievable occupancy derived from the range of \p - /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. + /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. A + /// minimum requested waves/EU value of 0 indicates an intent to not restrict + /// the minimum target occupancy. std::pair<unsigned, unsigned> getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU, std::pair<unsigned, unsigned> FlatWorkGroupSizes, diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index e9fe4f3c618c7..2ab38a9f12a52 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -225,3 +225,15 @@ entry: ret void } attributes #12 = {"amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2,10" "amdgpu-lds-size"="16384"} + +; At most 2 waves per execution unit. +; CHECK-LABEL: {{^}}empty_at_most_2: +; CHECK: SGPRBlocks: 12 +; CHECK: VGPRBlocks: 21 +; CHECK: NumSGPRsForWavesPerEU: 102 +; CHECK: NumVGPRsForWavesPerEU: 85 +define amdgpu_kernel void @empty_at_most_2() #13 { +entry: + ret void +} +attributes #13 = {"amdgpu-waves-per-eu"="0,2"} diff --git a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll index 67061bcb2a785..f8c7b593e6eba 100644 --- a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll @@ -57,5 +57,5 @@ entry: ret void } -attributes #0 = { "amdgpu-waves-per-eu"="1,1" } -attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" } +attributes #0 = { "amdgpu-waves-per-eu"="1" } +attributes #1 = { "amdgpu-waves-per-eu"="1" "amdgpu-flat-work-group-size"="1,1024" } diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir index f5558964d2707..6d34e045e55c5 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir @@ -1033,6 +1033,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_252:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 252, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_253:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 253, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_254:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 254, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_255:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 256, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]] @@ -1060,9 +1061,8 @@ body: | ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_220]], implicit [[V_CVT_I32_F64_e32_221]], implicit [[V_CVT_I32_F64_e32_222]], implicit [[V_CVT_I32_F64_e32_223]], implicit [[V_CVT_I32_F64_e32_224]], implicit [[V_CVT_I32_F64_e32_225]], implicit [[V_CVT_I32_F64_e32_226]], implicit [[V_CVT_I32_F64_e32_227]], implicit [[V_CVT_I32_F64_e32_228]], implicit [[V_CVT_I32_F64_e32_229]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_230]], implicit [[V_CVT_I32_F64_e32_231]], implicit [[V_CVT_I32_F64_e32_232]], implicit [[V_CVT_I32_F64_e32_233]], implicit [[V_CVT_I32_F64_e32_234]], implicit [[V_CVT_I32_F64_e32_235]], implicit [[V_CVT_I32_F64_e32_236]], implicit [[V_CVT_I32_F64_e32_237]], implicit [[V_CVT_I32_F64_e32_238]], implicit [[V_CVT_I32_F64_e32_239]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_240]], implicit [[V_CVT_I32_F64_e32_241]], implicit [[V_CVT_I32_F64_e32_242]], implicit [[V_CVT_I32_F64_e32_243]], implicit [[V_CVT_I32_F64_e32_244]], implicit [[V_CVT_I32_F64_e32_245]], implicit [[V_CVT_I32_F64_e32_246]], implicit [[V_CVT_I32_F64_e32_247]], implicit [[V_CVT_I32_F64_e32_248]], implicit [[V_CVT_I32_F64_e32_249]] - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_255:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_256:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 256, implicit $exec, implicit $mode - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_250]], implicit [[V_CVT_I32_F64_e32_251]], implicit [[V_CVT_I32_F64_e32_252]], implicit [[V_CVT_I32_F64_e32_253]], implicit [[V_CVT_I32_F64_e32_254]], implicit [[V_CVT_I32_F64_e32_255]], implicit [[V_CVT_I32_F64_e32_256]], implicit [[DEF]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_256:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_250]], implicit [[V_CVT_I32_F64_e32_251]], implicit [[V_CVT_I32_F64_e32_252]], implicit [[V_CVT_I32_F64_e32_253]], implicit [[V_CVT_I32_F64_e32_254]], implicit [[V_CVT_I32_F64_e32_256]], implicit [[V_CVT_I32_F64_e32_255]], implicit [[DEF]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: reduce_spill_archvgpr_above_addressable_limit @@ -1660,6 +1660,7 @@ body: | ; GFX908: bb.0: ; GFX908-NEXT: successors: %bb.1(0x80000000) ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode ; GFX908-NEXT: [[DEF:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX908-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF ; GFX908-NEXT: [[DEF2:%[0-9]+]]:agpr_32 = IMPLICIT_DEF @@ -1944,9 +1945,8 @@ body: | ; GFX908-NEXT: S_NOP 0, implicit [[DEF220]], implicit [[DEF221]], implicit [[DEF222]], implicit [[DEF223]], implicit [[DEF224]], implicit [[DEF225]], implicit [[DEF226]], implicit [[DEF227]], implicit [[DEF228]], implicit [[DEF229]] ; GFX908-NEXT: S_NOP 0, implicit [[DEF230]], implicit [[DEF231]], implicit [[DEF232]], implicit [[DEF233]], implicit [[DEF234]], implicit [[DEF235]], implicit [[DEF236]], implicit [[DEF237]], implicit [[DEF238]], implicit [[DEF239]] ; GFX908-NEXT: S_NOP 0, implicit [[DEF240]], implicit [[DEF241]], implicit [[DEF242]], implicit [[DEF243]], implicit [[DEF244]], implicit [[DEF245]], implicit [[DEF246]], implicit [[DEF247]], implicit [[DEF248]], implicit [[DEF249]] - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode - ; GFX908-NEXT: S_NOP 0, implicit [[DEF250]], implicit [[DEF251]], implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]], implicit [[DEF256]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[DEF250]], implicit [[DEF251]], implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]], implicit [[DEF256]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: reduce_spill_agpr_above_addressable_limit diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index b87d266cc2514..5e1cae0760c36 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s -; Check propagation of amdgpu-flat-work-group-size attribute. +; Check propagation of amdgpu-waves-per-eu attribute. ; Called from a single kernel with 1,8 define internal void @default_to_1_8_a() { @@ -216,41 +216,30 @@ define internal i32 @bitcasted_function() { ret i32 0 } -define internal void @called_from_invalid_bounds_0() { -; CHECK-LABEL: define internal void @called_from_invalid_bounds_0 -; CHECK-SAME: () #[[ATTR1]] { -; CHECK-NEXT: ret void -; +define internal void @called_without_min_waves() { ret void } -define internal void @called_from_invalid_bounds_1() { -; CHECK-LABEL: define internal void @called_from_invalid_bounds_1 +define internal void @called_from_invalid_bounds() { +; CHECK-LABEL: define internal void @called_from_invalid_bounds ; CHECK-SAME: () #[[ATTR10:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void } -; Invalid range for amdgpu-waves-per-eu -define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 { -; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_0_8 -; CHECK-SAME: () #[[ATTR1]] { -; CHECK-NEXT: call void @called_from_invalid_bounds_0() -; CHECK-NEXT: ret void -; - call void @called_from_invalid_bounds_0() +define internal void @called_from_invalid_bounds_1() { + call void @called_without_min_waves() ret void } - ; Invalid range for amdgpu-waves-per-eu define amdgpu_kernel void @kernel_invalid_bounds_1_123() #10 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_1_123 ; CHECK-SAME: () #[[ATTR11:[0-9]+]] { -; CHECK-NEXT: call void @called_from_invalid_bounds_1() +; CHECK-NEXT: call void @called_from_invalid_bounds() ; CHECK-NEXT: ret void ; - call void @called_from_invalid_bounds_1() + call void @called_from_invalid_bounds() ret void } @@ -279,7 +268,7 @@ define amdgpu_kernel void @kernel_3_6() #12 { ; 3,6 -> 6,9 define internal void @refine_upper_func_3_6() #13 { ; CHECK-LABEL: define internal void @refine_upper_func_3_6 -; CHECK-SAME: () #[[ATTR9]] { +; CHECK-SAME: () #[[ATTR14:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -288,7 +277,7 @@ define internal void @refine_upper_func_3_6() #13 { ; 4,8 -> 6,8 define internal void @refine_lower_func_4_8() #14 { ; CHECK-LABEL: define internal void @refine_lower_func_4_8 -; CHECK-SAME: () #[[ATTR14:[0-9]+]] { +; CHECK-SAME: () #[[ATTR15:[0-9]+]] { ; CHECK-NEXT: call void @refine_upper_func_3_6() ; CHECK-NEXT: ret void ; @@ -298,7 +287,7 @@ define internal void @refine_lower_func_4_8() #14 { define amdgpu_kernel void @kernel_foo_6_8() #15 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_foo_6_8 -; CHECK-SAME: () #[[ATTR15:[0-9]+]] { +; CHECK-SAME: () #[[ATTR16:[0-9]+]] { ; CHECK-NEXT: call void @refine_upper_func_3_6() ; CHECK-NEXT: call void @refine_lower_func_4_8() ; CHECK-NEXT: call void @func_9_10_a() @@ -313,7 +302,7 @@ define amdgpu_kernel void @kernel_foo_6_8() #15 { ; 5,5 -> 5,5 define internal void @func_5_5() #16 { ; CHECK-LABEL: define internal void @func_5_5 -; CHECK-SAME: () #[[ATTR16:[0-9]+]] { +; CHECK-SAME: () #[[ATTR17:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -322,7 +311,7 @@ define internal void @func_5_5() #16 { ; 5,8 -> 8,8 define internal void @func_5_8() #17 { ; CHECK-LABEL: define internal void @func_5_8 -; CHECK-SAME: () #[[ATTR17:[0-9]+]] { +; CHECK-SAME: () #[[ATTR18:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -331,7 +320,7 @@ define internal void @func_5_8() #17 { ; 9,10 -> 9,10 define internal void @func_9_10_a() #18 { ; CHECK-LABEL: define internal void @func_9_10_a -; CHECK-SAME: () #[[ATTR18:[0-9]+]] { +; CHECK-SAME: () #[[ATTR19:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void @@ -340,7 +329,7 @@ define internal void @func_9_10_a() #18 { ; 9,10 -> 9,9 define internal void @func_9_10_b() #18 { ; CHECK-LABEL: define internal void @func_9_10_b -; CHECK-SAME: () #[[ATTR18]] { +; CHECK-SAME: () #[[ATTR19]] { ; CHECK-NEXT: ret void ; ret void @@ -348,7 +337,7 @@ define internal void @func_9_10_b() #18 { define amdgpu_kernel void @kernel_bar_8_9() #19 { ; CHECK-LABEL: define amdgpu_kernel void @kernel_bar_8_9 -; CHECK-SAME: () #[[ATTR19:[0-9]+]] { +; CHECK-SAME: () #[[ATTR20:[0-9]+]] { ; CHECK-NEXT: call void @refine_upper_func_3_6() ; CHECK-NEXT: call void @func_5_5() ; CHECK-NEXT: call void @func_9_10_b() @@ -413,10 +402,11 @@ attributes #19 = { "amdgpu-waves-per-eu"="8,9" } ; CHECK: attributes #[[ATTR11]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR12]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR13]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR20]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/Verifier/AMDGPU/amdgpu-waves-per-eu.ll b/llvm/test/Verifier/AMDGPU/amdgpu-waves-per-eu.ll new file mode 100644 index 0000000000000..8686a4278147e --- /dev/null +++ b/llvm/test/Verifier/AMDGPU/amdgpu-waves-per-eu.ll @@ -0,0 +1,40 @@ +; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s + +target triple = "amdgcn-amd-amdhsa" + +define void @valid_amdgpu_waves_per_eu_range() "amdgpu-waves-per-eu"="2,4" { + ret void +} + +define void @valid_amdgpu_waves_per_eu_min_only() "amdgpu-waves-per-eu"="2" { + ret void +} + +define void @valid_amdgpu_waves_per_eu_max_only() "amdgpu-waves-per-eu"="0,4" { + ret void +} + +; CHECK: minimum for 'amdgpu-waves-per-eu' must be integer: x +define void @invalid_amdgpu_waves_per_eu_min_nan() "amdgpu-waves-per-eu"="x" { + ret void +} + +; CHECK: maximum for 'amdgpu-waves-per-eu' must be integer: x +define void @invalid_amdgpu_waves_per_eu_max_nan() "amdgpu-waves-per-eu"="0,x" { + ret void +} + +; CHECK: minimum for 'amdgpu-waves-per-eu' must be non-zero when maximum is not provided +define void @invalid_amdgpu_waves_per_eu_min_zero() "amdgpu-waves-per-eu"="0" { + ret void +} + +; CHECK: maximum for 'amdgpu-waves-per-eu' must be non-zero +define void @invalid_amdgpu_waves_per_eu_max_zero() "amdgpu-waves-per-eu"="2,0" { + ret void +} + +; CHECK: minimum must be less than or equal to maximum for 'amdgpu-waves-per-eu': 2 > 1 +define void @invalid_amdgpu_waves_per_eu_max_lt_min() "amdgpu-waves-per-eu"="2,1" { + ret void +} >From 580f9a99337a24fad9b5f93edd25202df420ddeb Mon Sep 17 00:00:00 2001 From: Lucas Ramirez <lucas.r...@proton.me> Date: Thu, 22 May 2025 13:59:39 +0000 Subject: [PATCH 2/2] Remove spurious comment --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 52540bac85a52..ee887447972bf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -1367,10 +1367,6 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) { return Changed; } -// 14 ==> 15 -// 15 ==> 16 -// 16 ==> 17 - static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM, AMDGPUAttributorOptions Options, ThinOrFullLTOPhase LTOPhase) { _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits