llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Diana Picus (rovka)

<details>
<summary>Changes</summary>

When using the `amdgcn.init.whole.wave` intrinsic, we add dummy VGPR arguments with the purpose of preserving their inactive lanes. The pattern may look something like this:

```
entry:
  call amdgcn.init.whole.wave
  branch to shader or tail
shader:
  $vInactive = IMPLICIT_DEF ; Tells regalloc it's safe to use the active lanes
  actual code...
tail:
  call amdgcn.cs.chain [...], implicit $vInactive
```

We should not report these VGPRs in the `.vgpr_count` metadata. This patch achieves that goal by ignoring IMPLICIT_DEFs and calls. This should be safe since if those registers are actually used in any other context, they will be counted there.

It also reduces the scope of the code that counts unused function arguments to only work on entry functions, since only they need to handle hardware-initialized registers.

This is a reworked version of #<!-- -->133242, which was reverted in #<!-- -->144039.

--- Patch is 29.65 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149052.diff 12 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (+1-1) - (modified) llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp (+3) - (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+22) - (added) llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll (+76) - (added) llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll (+50) - (added) llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll (+78) - (added) llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll (+75) - (modified) llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll (+2-2) - (added) llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll (+30) - (added) llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll (+27) - (added) llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll (+25) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index c0920e3e71bee..14c392b2b2250 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -993,7 +993,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // dispatch registers are function args. 
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; - if (isShader(F.getCallingConv())) { + if (AMDGPU::shouldReportUnusedFuncArgs(F.getCallingConv())) { bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index ec4daa2cf662a..bb0d2027c71f4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -213,6 +213,9 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( if (!RC || !TRI.isVGPRClass(RC)) continue; + if (MI.isCall() || MI.isImplicitDef()) + continue; + unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32); unsigned HWReg = TRI.getHWRegIndex(Reg); int MaxUsed = HWReg + Width - 1; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index e6078d6918ac2..bc06c68d968c6 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1367,6 +1367,28 @@ constexpr bool isEntryFunctionCC(CallingConv::ID CC) { } } +// Shaders that are entry functions need to count input arguments even if +// they're not used (i.e. not reported by AMDGPUResourceUsageAnalysis). Other +// functions can skip including them. This is especially important for shaders +// that use the init.whole.wave intrinsic, since they sometimes have VGPR +// arguments that are only added for the purpose of preserving their inactive +// lanes and should not be included in the vgpr-count. 
+LLVM_READNONE +constexpr bool shouldReportUnusedFuncArgs(CallingConv::ID CC) { + switch (CC) { + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_LS: + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + return true; + default: + return false; + } +} + LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC) { switch (CC) { diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll new file mode 100644 index 0000000000000..e47f5e25ead3a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll @@ -0,0 +1,76 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s + +; CHECK-LABEL: .shader_functions: + +; Use VGPRs above the input arguments. +; CHECK-LABEL: _miss_1: +; CHECK: .vgpr_count:{{.*}}0x1d{{$}} + +define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count, + i32 %vcr, { i32 } %system.data, + i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3, + i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7, + i32 %inactive.vgpr8, i32 %inactive.vgpr9) + local_unnamed_addr { +entry: + %system.data.value = extractvalue { i32 } %system.data, 0 + %dead.val = call i32 @llvm.amdgcn.dead.i32() + %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave() + br i1 %is.whole.wave, label %shader, label %tail + +shader: + %system.data.extract = extractvalue { i32 } %system.data, 0 + %data.mul = mul i32 %system.data.extract, 2 + %data.add = add i32 %data.mul, 1 + call void asm sideeffect "; clobber v28", "~{v28}"() + br label %tail + +tail: + %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ] + %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ] + %final.inactive0 = phi i32 [ %inactive.vgpr, %entry 
], [ %dead.val, %shader ] + %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ] + %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ] + %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ] + %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ] + %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ] + %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ] + %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ] + %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ] + %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ] + + %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0 + %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1 + %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2 + %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3 + %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4 + %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5 + %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6 + %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7 + %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32 } %struct.with.inactive5, i32 %final.inactive6, 8 + %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9 + %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10 + %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11 + + %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0 + %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1 + %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2 + %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3 + + call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...) + @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s( + ptr %next.callee, i32 0, <4 x i32> inreg %final.vec, + { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct, + i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32) + unreachable +} + +declare i32 @llvm.amdgcn.dead.i32() +declare i1 @llvm.amdgcn.init.whole.wave() +declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...) 
+ +declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg) + +!amdgpu.pal.metadata.msgpack = !{!0} + +!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"} \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll new file mode 100644 index 0000000000000..5d7472fd3c56e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll @@ -0,0 +1,50 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s + +; CHECK-LABEL: .shader_functions: + +; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers. +; CHECK-LABEL: leaf_shader: +; CHECK: .vgpr_count:{{.*}}0xc{{$}} + +; Function without calls. 
+define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value, + i32 %active.vgpr1, i32 %active.vgpr2, + i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3, + i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6) + local_unnamed_addr { +entry: + %dead.val = call i32 @llvm.amdgcn.dead.i32() + %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave() + br i1 %is.whole.wave, label %compute, label %merge + +compute: + ; Perform a more complex computation using active VGPRs + %square = mul i32 %active.vgpr1, %active.vgpr1 + %product = mul i32 %square, %active.vgpr2 + %sum = add i32 %product, %input.value + %result = add i32 %sum, 42 + br label %merge + +merge: + %final.result = phi i32 [ 0, %entry ], [ %result, %compute ] + %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ] + %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ] + %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ] + %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ] + %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ] + %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ] + + store i32 %final.result, ptr %output.ptr, align 4 + + ret void +} + +declare i32 @llvm.amdgcn.dead.i32() +declare i1 @llvm.amdgcn.init.whole.wave() +declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...) 
+ +declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg) + +!amdgpu.pal.metadata.msgpack = !{!0} + +!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"} diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll new file mode 100644 index 0000000000000..f1f7fb22d44c6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll @@ -0,0 +1,78 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s + +; CHECK-LABEL: .shader_functions: + +; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers. +; The shader is free to use any of the VGPRs mapped to a %inactive.vgpr as long as it only touches its active lanes. 
+; In that case, the VGPR should be included in the .vgpr_count +; CHECK-LABEL: _miss_1: +; CHECK: .vgpr_count:{{.*}}0xd{{$}} + +define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count, + i32 %vcr, { i32 } %system.data, + i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3, + i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7, + i32 %inactive.vgpr8, i32 %inactive.vgpr9) + local_unnamed_addr { +entry: + %system.data.value = extractvalue { i32 } %system.data, 0 + %dead.val = call i32 @llvm.amdgcn.dead.i32() + %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave() + br i1 %is.whole.wave, label %shader, label %tail + +shader: + %system.data.extract = extractvalue { i32 } %system.data, 0 + %data.mul = mul i32 %system.data.extract, 2 + %data.add = add i32 %data.mul, 1 + call void asm sideeffect "; use VGPR for %inactive.vgpr2", "~{v12}"() + br label %tail + +tail: + %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ] + %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ] + %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ] + %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ] + %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ] + %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ] + %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ] + %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ] + %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ] + %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ] + %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ] + %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ] + + %struct.init = insertvalue { i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0 + %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1 + %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2 + %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3 + %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4 + %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5 + %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6 + %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7 + %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8 + %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9 + %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10 + %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11 + + %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0 + %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1 + %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2 + %final.vec = insertelement <4 x i32> 
%vec.sys.data, i32 0, i64 3 + + call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...) + @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s( + ptr %next.callee, i32 0, <4 x i32> inreg %final.vec, + { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct, + i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32) + unreachable +} + +declare i32 @llvm.amdgcn.dead.i32() +declare i1 @llvm.amdgcn.init.whole.wave() +declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...) + +declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg) + +!amdgpu.pal.metadata.msgpack = !{!0} + +!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"} diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll new file mode 100644 index 0000000000000..b9130dd1b7ed4 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll @@ -0,0 +1,75 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s + +; CHECK-LABEL: .shader_functions: + +; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers. +; CHECK-LABEL: _miss_1: +; CHECK: .vgpr_count:{{.*}}0xa{{$}} + +define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count, + i32 %vcr, { i32 } %system.data, + i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3, + i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7, + i32 %inactive.vgpr8, i32 %inactive.vgpr9) + local_unnamed_addr { +entry: + %system.data.value = extractvalue { i32 } %system.data, 0 + %dead.val = call i32 @llvm.amdgcn.dead.i32() + %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave() + br i1 %is.whole.wave, label %shader, label %tail + +shader: + %system.data.extract = extractvalue { i32 } %system.data, 0 + %data.mul = mul i32 %system.data.extract, 2 + %data.add = add i32 %data.mul, 1 + br label %tail + +tail: + %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ] + %final.sys.data = phi i32 [ %system.data.value, %entry ], [ ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/149052 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits