https://github.com/linuxrocks123 updated https://github.com/llvm/llvm-project/pull/164847
>From ddda6473ab7ae8485a906a749eebad0853b857ca Mon Sep 17 00:00:00 2001 From: Patrick Simmons <[email protected]> Date: Thu, 23 Oct 2025 11:50:32 -0500 Subject: [PATCH 01/11] Initial work --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 3 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 ++++ .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 43 +++++++++++++++++++ llvm/lib/Target/AMDGPU/SOPInstructions.td | 8 +++- 4 files changed, 60 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 8428fa97fe445..f17156f8a24ab 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -63,6 +63,9 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc") BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc") BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc") +BUILTIN(__builtin_amdgcn_bcnt032_lo, "UiUi", "nc") +BUILTIN(__builtin_amdgcn_bcnt064_lo, "UiWUi", "nc") + TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst") //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 9e334d4316336..50b43a1c927ce 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2359,6 +2359,14 @@ def int_amdgcn_mbcnt_hi : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; +def int_amdgcn_bcnt032_lo : + ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + +def int_amdgcn_bcnt064_lo : + ClangBuiltin<"__builtin_amdgcn_bcnt064_lo">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; + // llvm.amdgcn.ds.swizzle src offset def int_amdgcn_ds_swizzle : ClangBuiltin<"__builtin_amdgcn_ds_swizzle">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 8e35ba77d69aa..39b558694edf8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" @@ -35,6 +36,7 @@ #include "llvm/Support/KnownFPClass.h" #include "llvm/Transforms/Utils/IntegerDivision.h" #include "llvm/Transforms/Utils/Local.h" +#include <cstdint> #define DEBUG_TYPE "amdgpu-codegenprepare" @@ -93,6 +95,13 @@ static cl::opt<bool> DisableFDivExpand( cl::ReallyHidden, cl::init(false)); +// Disable processing of fdiv so we can better test the backend implementations. +static cl::opt<bool> + DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0", + cl::desc("Prevent transforming bitsin(typeof(x)) - " + "popcount(x) to bcnt0(x) in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, cl::init(false)); + class AMDGPUCodeGenPrepareImpl : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> { public: @@ -258,6 +267,7 @@ class AMDGPUCodeGenPrepareImpl bool visitAddrSpaceCastInst(AddrSpaceCastInst &I); bool visitIntrinsicInst(IntrinsicInst &I); + bool visitCtpop(IntrinsicInst &I); bool visitFMinLike(IntrinsicInst &I); bool visitSqrt(IntrinsicInst &I); bool run(); @@ -1910,6 +1920,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { return visitFMinLike(I); case Intrinsic::sqrt: return visitSqrt(I); + case Intrinsic::ctpop: + return visitCtpop(I); default: return false; } @@ -1977,6 +1989,37 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder, return insertValues(Builder, FractArg->getType(), ResultVals); } +bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) { + uint32_t BitWidth, DestinationWidth, IntrinsicWidth; + if (!I.hasOneUse() || + !ST.hasBCNT(BitWidth = 
I.getType()->getIntegerBitWidth())) + return false; + + BinaryOperator *MustBeSub = dyn_cast<BinaryOperator>(I.user_back()); + if (!MustBeSub || MustBeSub->getOpcode() != BinaryOperator::Sub) + return false; + + ConstantInt *FirstOperand = dyn_cast<ConstantInt>(MustBeSub->getOperand(0)); + if (!FirstOperand || FirstOperand->getZExtValue() != BitWidth) + return false; + + IRBuilder<> Builder(MustBeSub); + Instruction *TransformedIns = + Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt064_lo + : Intrinsic::amdgcn_bcnt032_lo, + {}, {I.getArgOperand(0)}); + + if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) != + (IntrinsicWidth = TransformedIns->getType()->getIntegerBitWidth())) + TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc( + TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth))); + + MustBeSub->replaceAllUsesWith(TransformedIns); + TransformedIns->takeName(MustBeSub); + MustBeSub->eraseFromParent(); + return true; +} + bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) { Value *FractArg = matchFractPat(I); if (!FractArg) diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 84287b621fe78..29104d33a8aa8 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -264,8 +264,12 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64", } // End isReMaterializable = 1, isAsCheapAsAMove = 1 let Defs = [SCC] in { -def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">; -def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">; +def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32", + [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt032_lo> i32:$src0))] +>; +def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64", + [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt064_lo> i64:$src0))] +>; def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32", [(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))] >; >From 
249ee64fd6ec23cb65433a5dc56145f3effa158d Mon Sep 17 00:00:00 2001 From: Patrick Simmons <[email protected]> Date: Thu, 23 Oct 2025 14:20:42 -0500 Subject: [PATCH 02/11] Update testcases --- llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 38 +++++++++++++---------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index dd5f838b4a206..db030d2b19d90 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -444,16 +444,14 @@ define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) { define amdgpu_ps i32 @bcnt032(i32 inreg %val0) { ; CHECK-LABEL: bcnt032: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0 -; CHECK-NEXT: s_sub_i32 s0, 32, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone %result2 = sub i32 32, %result call void asm "; use $0", "s"(i32 %result2) @@ -465,17 +463,15 @@ define amdgpu_ps i32 @bcnt032(i32 inreg %val0) { define amdgpu_ps i32 @bcnt064(i64 inreg %val0) { ; CHECK-LABEL: bcnt064: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; CHECK-NEXT: s_sub_u32 s0, 64, s0 -; CHECK-NEXT: s_subb_u32 s1, 0, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: 
v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone %result2 = sub i64 64, %result call void asm "; use $0", "s"(i64 %result2) >From 5bd7c7b2045c7669d8d326d8bc3ca4216dda6597 Mon Sep 17 00:00:00 2001 From: Patrick Simmons <[email protected]> Date: Thu, 23 Oct 2025 17:31:31 -0500 Subject: [PATCH 03/11] Don't perform optimization on vector types --- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 39b558694edf8..8f13fa79d3637 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -1991,7 +1991,7 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder, bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) { uint32_t BitWidth, DestinationWidth, IntrinsicWidth; - if (!I.hasOneUse() || + if (!I.hasOneUse() || !I.getType()->isIntegerTy() || !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth())) return false; >From 1030ef31f96040975f02191af0a5a57374c5e0e9 Mon Sep 17 00:00:00 2001 From: Patrick Simmons <[email protected]> Date: Thu, 23 Oct 2025 18:29:17 -0500 Subject: [PATCH 04/11] Review changes --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 ++-- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 ++++---- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 6 +++--- llvm/lib/Target/AMDGPU/SOPInstructions.td | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git 
a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index f17156f8a24ab..f18d1f8df0b71 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -63,8 +63,8 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc") BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc") BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc") -BUILTIN(__builtin_amdgcn_bcnt032_lo, "UiUi", "nc") -BUILTIN(__builtin_amdgcn_bcnt064_lo, "UiWUi", "nc") +BUILTIN(__builtin_amdgcn_bcnt32_lo, "UiUi", "nc") +BUILTIN(__builtin_amdgcn_bcnt64_lo, "UiWUi", "nc") TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst") diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 50b43a1c927ce..476f0bcb42b31 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2359,12 +2359,12 @@ def int_amdgcn_mbcnt_hi : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_amdgcn_bcnt032_lo : - ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">, +def int_amdgcn_bcnt32_lo : + ClangBuiltin<"__builtin_amdgcn_bcnt32_lo">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_amdgcn_bcnt064_lo : - ClangBuiltin<"__builtin_amdgcn_bcnt064_lo">, +def int_amdgcn_bcnt64_lo : + ClangBuiltin<"__builtin_amdgcn_bcnt64_lo">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; // llvm.amdgcn.ds.swizzle src offset diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 8f13fa79d3637..169541d9d45f6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -95,7 +95,7 @@ static cl::opt<bool> DisableFDivExpand( cl::ReallyHidden, cl::init(false)); -// Disable processing of fdiv so we can better test the backend implementations. 
+// Disable bitsin(typeof(x)) - popcnt(x) to s_bcnt0(x) transformation. static cl::opt<bool> DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0", cl::desc("Prevent transforming bitsin(typeof(x)) - " @@ -2005,8 +2005,8 @@ bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) { IRBuilder<> Builder(MustBeSub); Instruction *TransformedIns = - Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt064_lo - : Intrinsic::amdgcn_bcnt032_lo, + Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt64_lo + : Intrinsic::amdgcn_bcnt32_lo, {}, {I.getArgOperand(0)}); if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) != diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 29104d33a8aa8..00d5cab2de479 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -265,10 +265,10 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64", let Defs = [SCC] in { def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32", - [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt032_lo> i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt32_lo> i32:$src0))] >; def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64", - [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt064_lo> i64:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt64_lo> i64:$src0))] >; def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32", [(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))] >From 165f82de021625f430571cdeb6894fb3acf42cba Mon Sep 17 00:00:00 2001 From: Patrick Simmons <[email protected]> Date: Fri, 24 Oct 2025 14:12:23 -0500 Subject: [PATCH 05/11] Review changes: - Add tests - Remove builtin (users will need inline assembly if pattern match fails) --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 3 - llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 - llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 109 +++++++++++++++++++ 3 files changed, 109 insertions(+), 5 deletions(-) diff --git 
a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index f18d1f8df0b71..8428fa97fe445 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -63,9 +63,6 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc") BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc") BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc") -BUILTIN(__builtin_amdgcn_bcnt32_lo, "UiUi", "nc") -BUILTIN(__builtin_amdgcn_bcnt64_lo, "UiWUi", "nc") - TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst") //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 476f0bcb42b31..ca4abe29dd96a 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2360,11 +2360,9 @@ def int_amdgcn_mbcnt_hi : [IntrNoMem]>; def int_amdgcn_bcnt32_lo : - ClangBuiltin<"__builtin_amdgcn_bcnt32_lo">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_amdgcn_bcnt64_lo : - ClangBuiltin<"__builtin_amdgcn_bcnt64_lo">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; // llvm.amdgcn.ds.swizzle src offset diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index db030d2b19d90..a9516057be1ef 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -621,3 +621,112 @@ if: endif: ret i32 1 } + +define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; CHECK-LABEL: bcnt032_not_for_vregs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s0, s0, 2 +; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; CHECK-NEXT: global_load_dword v2, v[2:3], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0 +; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2 +; 
CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use v3 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: global_store_dword v[0:1], v2, off +; CHECK-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid + %val0 = load volatile i32, ptr addrspace(1) %gep + %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %result2 = sub i32 32, %result + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; CHECK-LABEL: bcnt064_not_for_vregs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s0, s0, 2 +; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0 +; CHECK-NEXT: v_bcnt_u32_b32 v3, v3, v2 +; CHECK-NEXT: v_sub_co_u32_e32 v5, vcc, 64, v3 +; CHECK-NEXT: v_subb_co_u32_e64 v6, s[0:1], 0, 0, vcc +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use v[5:6] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[3:4], off +; CHECK-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid + %val0 = load volatile i64, ptr addrspace(1) %gep + %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone + %result2 = sub i64 64, %result + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + store i64 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { +; CHECK-LABEL: bcnt032_ctpop_multiple_uses: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0 +; CHECK-NEXT: s_sub_i32 s1, 32, s0 +; CHECK-NEXT: s_cmp_lg_u32 s1, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s1 +; CHECK-NEXT: ;;#ASMEND 
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %result2 = sub i32 32, %result + call void asm "; use $0", "s"(i32 %result) + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) { +; CHECK-LABEL: bcnt064_ctpop_multiple_uses: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_sub_u32 s2, 64, s0 +; CHECK-NEXT: s_subb_u32 s3, 0, 0 +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[2:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; return to shader part epilog + %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone + %result2 = sub i64 64, %result + call void asm "; use $0", "s"(i64 %result) + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} \ No newline at end of file >From 168a5e33042afbc49c7d7063248ea32e49c7e3b5 Mon Sep 17 00:00:00 2001 From: Patrick Simmons <[email protected]> Date: Mon, 27 Oct 2025 14:18:45 -0500 Subject: [PATCH 06/11] Reviewer-suggested refactoring --- .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 169541d9d45f6..94dcba7aab3e2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -34,6 +34,7 @@ #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/KnownFPClass.h" +#include 
"llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/IntegerDivision.h" #include "llvm/Transforms/Utils/Local.h" #include <cstdint> @@ -1990,17 +1991,16 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder, } bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) { - uint32_t BitWidth, DestinationWidth, IntrinsicWidth; - if (!I.hasOneUse() || !I.getType()->isIntegerTy() || - !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth())) + uint32_t BitWidth, DestinationWidth; + if (!I.hasOneUse() || !I.getType()->isIntegerTy()) return false; - BinaryOperator *MustBeSub = dyn_cast<BinaryOperator>(I.user_back()); - if (!MustBeSub || MustBeSub->getOpcode() != BinaryOperator::Sub) + BitWidth = I.getType()->getIntegerBitWidth(); + if(!ST.hasBCNT(BitWidth)) return false; - ConstantInt *FirstOperand = dyn_cast<ConstantInt>(MustBeSub->getOperand(0)); - if (!FirstOperand || FirstOperand->getZExtValue() != BitWidth) + Instruction *MustBeSub = I.user_back(); + if (!match(MustBeSub, m_Sub(m_SpecificInt(BitWidth), m_Specific(&I)))) return false; IRBuilder<> Builder(MustBeSub); @@ -2009,14 +2009,12 @@ bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) { : Intrinsic::amdgcn_bcnt32_lo, {}, {I.getArgOperand(0)}); - if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) != - (IntrinsicWidth = TransformedIns->getType()->getIntegerBitWidth())) - TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc( - TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth))); + DestinationWidth = MustBeSub->getType()->getIntegerBitWidth(); + TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc( + TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth))); - MustBeSub->replaceAllUsesWith(TransformedIns); - TransformedIns->takeName(MustBeSub); - MustBeSub->eraseFromParent(); + BasicBlock::iterator SubIt = MustBeSub->getIterator(); + ReplaceInstWithValue(SubIt,TransformedIns); return true; } >From 
9dd73e67b8982bfe97ec6a75d855a9c135615fd9 Mon Sep 17 00:00:00 2001 From: Patrick Simmons <[email protected]> Date: Tue, 28 Oct 2025 17:30:52 -0500 Subject: [PATCH 07/11] Revert implementation --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 --- .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 41 ------------------- 2 files changed, 47 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index ca4abe29dd96a..9e334d4316336 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2359,12 +2359,6 @@ def int_amdgcn_mbcnt_hi : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -def int_amdgcn_bcnt32_lo : - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - -def int_amdgcn_bcnt64_lo : - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; - // llvm.amdgcn.ds.swizzle src offset def int_amdgcn_ds_swizzle : ClangBuiltin<"__builtin_amdgcn_ds_swizzle">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 94dcba7aab3e2..8e35ba77d69aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -26,7 +26,6 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" @@ -34,10 +33,8 @@ #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/KnownFPClass.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/IntegerDivision.h" #include "llvm/Transforms/Utils/Local.h" -#include <cstdint> #define DEBUG_TYPE "amdgpu-codegenprepare" @@ -96,13 +93,6 @@ static cl::opt<bool> DisableFDivExpand( cl::ReallyHidden, cl::init(false)); -// Disable bitsin(typeof(x)) - popcnt(x) to s_bcnt0(x) 
transformation. -static cl::opt<bool> - DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0", - cl::desc("Prevent transforming bitsin(typeof(x)) - " - "popcount(x) to bcnt0(x) in AMDGPUCodeGenPrepare"), - cl::ReallyHidden, cl::init(false)); - class AMDGPUCodeGenPrepareImpl : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> { public: @@ -268,7 +258,6 @@ class AMDGPUCodeGenPrepareImpl bool visitAddrSpaceCastInst(AddrSpaceCastInst &I); bool visitIntrinsicInst(IntrinsicInst &I); - bool visitCtpop(IntrinsicInst &I); bool visitFMinLike(IntrinsicInst &I); bool visitSqrt(IntrinsicInst &I); bool run(); @@ -1921,8 +1910,6 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { return visitFMinLike(I); case Intrinsic::sqrt: return visitSqrt(I); - case Intrinsic::ctpop: - return visitCtpop(I); default: return false; } @@ -1990,34 +1977,6 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder, return insertValues(Builder, FractArg->getType(), ResultVals); } -bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) { - uint32_t BitWidth, DestinationWidth; - if (!I.hasOneUse() || !I.getType()->isIntegerTy()) - return false; - - BitWidth = I.getType()->getIntegerBitWidth(); - if(!ST.hasBCNT(BitWidth)) - return false; - - Instruction *MustBeSub = I.user_back(); - if (!match(MustBeSub, m_Sub(m_SpecificInt(BitWidth), m_Specific(&I)))) - return false; - - IRBuilder<> Builder(MustBeSub); - Instruction *TransformedIns = - Builder.CreateIntrinsic(BitWidth > 32 ? 
Intrinsic::amdgcn_bcnt64_lo - : Intrinsic::amdgcn_bcnt32_lo, - {}, {I.getArgOperand(0)}); - - DestinationWidth = MustBeSub->getType()->getIntegerBitWidth(); - TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc( - TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth))); - - BasicBlock::iterator SubIt = MustBeSub->getIterator(); - ReplaceInstWithValue(SubIt,TransformedIns); - return true; -} - bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) { Value *FractArg = matchFractPat(I); if (!FractArg) >From c3d205ab0ebcd3cdd1f555679b52b4a756c15855 Mon Sep 17 00:00:00 2001 From: Patrick Simmons <[email protected]> Date: Tue, 28 Oct 2025 19:11:33 -0500 Subject: [PATCH 08/11] Use S-expressions instead --- llvm/lib/Target/AMDGPU/SOPInstructions.td | 13 ++++-- llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 56 +++++++++++------------ 2 files changed, 37 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 00d5cab2de479..9e284d8ab7a48 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -265,11 +265,9 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64", let Defs = [SCC] in { def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32", - [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt32_lo> i32:$src0))] ->; -def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64", - [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt64_lo> i64:$src0))] + [(set i32:$sdst, (UniformBinFrag<sub> 32, (UniformUnaryFrag<ctpop> i32:$src0)))] >; +def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">; def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32", [(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))] >; @@ -1888,6 +1886,13 @@ def : GCNPat < (S_MOV_B32 (i32 0)), sub1)) >; +def : GCNPat < + (i64 (UniformBinFrag<sub> 64, (UniformUnaryFrag<ctpop> i64:$src))), + (i64 (REG_SEQUENCE SReg_64, + (i32 (COPY_TO_REGCLASS (S_BCNT0_I32_B64 $src), SReg_32)), sub0, + (S_MOV_B32 (i32 
0)), sub1)) +>; + def : GCNPat < (i32 (UniformBinFrag<smax> i32:$x, (i32 (ineg i32:$x)))), (S_ABS_I32 SReg_32:$x) diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index a9516057be1ef..af892ecd31ca4 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -465,6 +465,7 @@ define amdgpu_ps i32 @bcnt064(i64 inreg %val0) { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] ; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND @@ -682,19 +683,18 @@ define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspac define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { ; CHECK-LABEL: bcnt032_ctpop_multiple_uses: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0 -; CHECK-NEXT: s_sub_i32 s1, 32, s0 -; CHECK-NEXT: s_cmp_lg_u32 s1, 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s1 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0 +; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s1 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone %result2 = sub i32 32, %result call void asm "; use $0", "s"(i32 %result) @@ -707,21 +707,21 @@ define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) { ; CHECK-LABEL: bcnt064_ctpop_multiple_uses: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; 
CHECK-NEXT: s_sub_u32 s2, 64, s0 -; CHECK-NEXT: s_subb_u32 s3, 0, 0 -; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_mov_b32 s3, 0 +; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_mov_b32 s1, s3 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[2:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; return to shader part epilog %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone %result2 = sub i64 64, %result call void asm "; use $0", "s"(i64 %result) >From c617ef5582dd64ed0cfd812202f632e3b5007a7e Mon Sep 17 00:00:00 2001 From: Patrick Simmons <[email protected]> Date: Wed, 29 Oct 2025 12:35:46 -0500 Subject: [PATCH 09/11] Review changes --- llvm/lib/Target/AMDGPU/SOPInstructions.td | 4 +- llvm/test/CodeGen/AMDGPU/s_bcnt0.ll | 110 ++++++++++++++++++++ llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 108 --------------------- 3 files changed, 112 insertions(+), 110 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/s_bcnt0.ll diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 9e284d8ab7a48..039cd2dd11b8d 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -265,7 +265,7 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64", let Defs = [SCC] in { def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32", - [(set i32:$sdst, (UniformBinFrag<sub> 32, (UniformUnaryFrag<ctpop> i32:$src0)))] + [(set i32:$sdst, 
(UniformBinFrag<sub> 32, (ctpop i32:$src0)))] >; def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">; def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32", @@ -1887,7 +1887,7 @@ def : GCNPat < >; def : GCNPat < - (i64 (UniformBinFrag<sub> 64, (UniformUnaryFrag<ctpop> i64:$src))), + (i64 (UniformBinFrag<sub> 64, (ctpop i64:$src))), (i64 (REG_SEQUENCE SReg_64, (i32 (COPY_TO_REGCLASS (S_BCNT0_I32_B64 $src), SReg_32)), sub0, (S_MOV_B32 (i32 0)), sub1)) diff --git a/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll b/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll new file mode 100644 index 0000000000000..a73a12ece94f3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll @@ -0,0 +1,110 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s + +define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; CHECK-LABEL: bcnt032_not_for_vregs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s0, s0, 2 +; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; CHECK-NEXT: global_load_dword v2, v[2:3], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0 +; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use v3 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: global_store_dword v[0:1], v2, off +; CHECK-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid + %val0 = load volatile i32, ptr addrspace(1) %gep + %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %result2 = sub i32 32, %result + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; CHECK-LABEL: 
bcnt064_not_for_vregs: +; CHECK: ; %bb.0: +; CHECK-NEXT: b32 s0, s0, 2 +; CHECK-NEXT: o_u32_e32 v2, vcc, s0, v2 +; CHECK-NEXT: co_u32_e32 v3, vcc, 0, v3, vcc +; CHECK-NEXT: load_dwordx2 v[2:3], v[2:3], off glc +; CHECK-NEXT: nt vmcnt(0) +; CHECK-NEXT: 32_e32 v4, 0 +; CHECK-NEXT: u32_b32 v2, v2, 0 +; CHECK-NEXT: u32_b32 v3, v3, v2 +; CHECK-NEXT: o_u32_e32 v5, vcc, 64, v3 +; CHECK-NEXT: co_u32_e64 v6, s[0:1], 0, 0, vcc +; CHECK-NEXT: TART +; CHECK-NEXT: [5:6] +; CHECK-NEXT: ND +; CHECK-NEXT: store_dwordx2 v[0:1], v[3:4], off +; CHECK-NEXT: m + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid + %val0 = load volatile i64, ptr addrspace(1) %gep + %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone + %result2 = sub i64 64, %result + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + store i64 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { +; CHECK-LABEL: bcnt032_ctpop_multiple_uses: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0 +; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s1 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %result2 = sub i32 32, %result + call void asm "; use $0", "s"(i32 %result) + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) { +; CHECK-LABEL: bcnt064_ctpop_multiple_uses: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s3, 0 +; CHECK-NEXT: s_bcnt1_i32_b64 s2, 
s[0:1] +; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_mov_b32 s1, s3 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[2:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; return to shader part epilog + %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone + %result2 = sub i64 64, %result + call void asm "; use $0", "s"(i64 %result) + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index af892ecd31ca4..63c00fc2d9c5d 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -621,112 +621,4 @@ if: endif: ret i32 1 -} - -define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; CHECK-LABEL: bcnt032_not_for_vregs: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; CHECK-NEXT: global_load_dword v2, v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0 -; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v3 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: global_store_dword v[0:1], v2, off -; CHECK-NEXT: s_endpgm - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid - %val0 = load volatile i32, ptr addrspace(1) %gep - %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone - %result2 = sub i32 32, %result - call void asm "; use $0", "s"(i32 %result2) - %cmp = icmp ne i32 %result2, 0 - %zext = zext i1 %cmp to i32 
- store i32 %result, ptr addrspace(1) %out - ret void -} - -define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; CHECK-LABEL: bcnt064_not_for_vregs: -; CHECK: ; %bb.0: -; CHECK-NEXT: b32 s0, s0, 2 -; CHECK-NEXT: o_u32_e32 v2, vcc, s0, v2 -; CHECK-NEXT: co_u32_e32 v3, vcc, 0, v3, vcc -; CHECK-NEXT: load_dwordx2 v[2:3], v[2:3], off glc -; CHECK-NEXT: nt vmcnt(0) -; CHECK-NEXT: 32_e32 v4, 0 -; CHECK-NEXT: u32_b32 v2, v2, 0 -; CHECK-NEXT: u32_b32 v3, v3, v2 -; CHECK-NEXT: o_u32_e32 v5, vcc, 64, v3 -; CHECK-NEXT: co_u32_e64 v6, s[0:1], 0, 0, vcc -; CHECK-NEXT: TART -; CHECK-NEXT: [5:6] -; CHECK-NEXT: ND -; CHECK-NEXT: store_dwordx2 v[0:1], v[3:4], off -; CHECK-NEXT: m - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid - %val0 = load volatile i64, ptr addrspace(1) %gep - %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone - %result2 = sub i64 64, %result - call void asm "; use $0", "s"(i64 %result2) - %cmp = icmp ne i64 %result2, 0 - %zext = zext i1 %cmp to i32 - store i64 %result, ptr addrspace(1) %out - ret void -} - -define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { -; CHECK-LABEL: bcnt032_ctpop_multiple_uses: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0 -; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s1 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog - %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone - %result2 = sub i32 32, %result - call void asm "; use $0", "s"(i32 %result) - call void asm "; use $0", "s"(i32 %result2) - %cmp = icmp ne i32 %result2, 0 - %zext = zext i1 %cmp to i32 - ret i32 %zext -} - -define amdgpu_ps i32 
@bcnt064_ctpop_multiple_uses(i64 inreg %val0) { -; CHECK-LABEL: bcnt064_ctpop_multiple_uses: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_mov_b32 s3, 0 -; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] -; CHECK-NEXT: s_mov_b32 s1, s3 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; return to shader part epilog - %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone - %result2 = sub i64 64, %result - call void asm "; use $0", "s"(i64 %result) - call void asm "; use $0", "s"(i64 %result2) - %cmp = icmp ne i64 %result2, 0 - %zext = zext i1 %cmp to i32 - ret i32 %zext } \ No newline at end of file >From f5776e117b50e7f69470fa677c5d2d55f272dd9f Mon Sep 17 00:00:00 2001 From: Patrick Simmons <[email protected]> Date: Wed, 29 Oct 2025 12:37:29 -0500 Subject: [PATCH 10/11] Newline --- llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index 63c00fc2d9c5d..1a7fb38f12473 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -621,4 +621,4 @@ if: endif: ret i32 1 -} \ No newline at end of file +} >From d4d342847cbf03cd186260cbc07ad016f2ec8386 Mon Sep 17 00:00:00 2001 From: Patrick Simmons <[email protected]> Date: Fri, 31 Oct 2025 16:10:03 -0500 Subject: [PATCH 11/11] Fix testcases --- llvm/test/CodeGen/AMDGPU/s_bcnt0.ll | 114 ++++++++++++++-------------- llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 36 ++++----- 2 files changed, 75 insertions(+), 75 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll b/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll index a73a12ece94f3..d26d12d821026 100644 --- 
a/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll @@ -3,19 +3,19 @@ define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; CHECK-LABEL: bcnt032_not_for_vregs: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; CHECK-NEXT: global_load_dword v2, v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0 -; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v3 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: global_store_dword v[0:1], v2, off -; CHECK-NEXT: s_endpgm +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s0, s0, 2 +; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; CHECK-NEXT: global_load_dword v2, v[2:3], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0 +; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use v3 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: global_store_dword v[0:1], v2, off +; CHECK-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid %val0 = load volatile i32, ptr addrspace(1) %gep @@ -30,22 +30,22 @@ define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspac define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; CHECK-LABEL: bcnt064_not_for_vregs: -; CHECK: ; %bb.0: -; CHECK-NEXT: b32 s0, s0, 2 -; CHECK-NEXT: o_u32_e32 v2, vcc, s0, v2 -; CHECK-NEXT: co_u32_e32 v3, vcc, 0, v3, vcc -; CHECK-NEXT: load_dwordx2 v[2:3], v[2:3], off glc -; CHECK-NEXT: nt vmcnt(0) -; CHECK-NEXT: 32_e32 v4, 0 -; CHECK-NEXT: u32_b32 v2, v2, 0 -; CHECK-NEXT: u32_b32 v3, v3, v2 -; CHECK-NEXT: o_u32_e32 v5, vcc, 64, v3 -; CHECK-NEXT: co_u32_e64 v6, s[0:1], 0, 0, vcc -; CHECK-NEXT: TART -; CHECK-NEXT: 
[5:6] -; CHECK-NEXT: ND -; CHECK-NEXT: store_dwordx2 v[0:1], v[3:4], off -; CHECK-NEXT: m +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s0, s0, 2 +; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0 +; CHECK-NEXT: v_bcnt_u32_b32 v3, v3, v2 +; CHECK-NEXT: v_sub_co_u32_e32 v5, vcc, 64, v3 +; CHECK-NEXT: v_subb_co_u32_e64 v6, s[0:1], 0, 0, vcc +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use v[5:6] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[3:4], off +; CHECK-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid %val0 = load volatile i64, ptr addrspace(1) %gep @@ -61,18 +61,18 @@ define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspac define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { ; CHECK-LABEL: bcnt032_ctpop_multiple_uses: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0 -; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s1 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0 +; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s1 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone 
%result2 = sub i32 32, %result call void asm "; use $0", "s"(i32 %result) @@ -85,21 +85,21 @@ define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) { define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) { ; CHECK-LABEL: bcnt064_ctpop_multiple_uses: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_mov_b32 s3, 0 -; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[0:1] -; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] -; CHECK-NEXT: s_mov_b32 s1, s3 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_mov_b32 s3, 0 +; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[0:1] +; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_mov_b32 s1, s3 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[2:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; return to shader part epilog %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone %result2 = sub i64 64, %result call void asm "; use $0", "s"(i64 %result) @@ -107,4 +107,4 @@ define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) { %cmp = icmp ne i64 %result2, 0 %zext = zext i1 %cmp to i32 ret i32 %zext -} \ No newline at end of file +} diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index 1a7fb38f12473..e5823d339384d 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -444,14 +444,14 @@ define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) { define amdgpu_ps i32 @bcnt032(i32 
inreg %val0) { ; CHECK-LABEL: bcnt032: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone %result2 = sub i32 32, %result call void asm "; use $0", "s"(i32 %result2) @@ -463,16 +463,16 @@ define amdgpu_ps i32 @bcnt032(i32 inreg %val0) { define amdgpu_ps i32 @bcnt064(i64 inreg %val0) { ; CHECK-LABEL: bcnt064: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] -; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone %result2 = sub i64 64, %result call void asm "; use $0", "s"(i64 %result2) _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
