https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/134111
>From 46de785e801bf8ca87e01aee9ad0a13ac07a47d6 Mon Sep 17 00:00:00 2001 From: Alex Maclean <amacl...@nvidia.com> Date: Tue, 1 Apr 2025 20:22:24 +0000 Subject: [PATCH] [NVPTX] Auto-Upgrade llvm.nvvm.atomic.load.{inc,dec}.32 --- clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp | 18 ++----- clang/test/CodeGen/builtins-nvptx.c | 4 +- llvm/include/llvm/IR/IntrinsicsNVVM.td | 10 +--- .../include/llvm/Target/TargetSelectionDAG.td | 2 + llvm/lib/IR/AutoUpgrade.cpp | 9 ++++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 15 ++++-- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 4 +- .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 52 +++++++++---------- .../Assembler/auto_upgrade_nvvm_intrinsics.ll | 16 +++++- llvm/test/CodeGen/NVPTX/atomics.ll | 36 ++++++++++++- 10 files changed, 107 insertions(+), 59 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp index aaac19b229905..0f7ab9fd3b099 100644 --- a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp @@ -481,21 +481,11 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, AtomicOrdering::SequentiallyConsistent); } - case NVPTX::BI__nvvm_atom_inc_gen_ui: { - Value *Ptr = EmitScalarExpr(E->getArg(0)); - Value *Val = EmitScalarExpr(E->getArg(1)); - Function *FnALI32 = - CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_inc_32, Ptr->getType()); - return Builder.CreateCall(FnALI32, {Ptr, Val}); - } + case NVPTX::BI__nvvm_atom_inc_gen_ui: + return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UIncWrap, E); - case NVPTX::BI__nvvm_atom_dec_gen_ui: { - Value *Ptr = EmitScalarExpr(E->getArg(0)); - Value *Val = EmitScalarExpr(E->getArg(1)); - Function *FnALD32 = - CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_dec_32, Ptr->getType()); - return Builder.CreateCall(FnALD32, {Ptr, Val}); - } + case NVPTX::BI__nvvm_atom_dec_gen_ui: + return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UDecWrap, E); case NVPTX::BI__nvvm_ldg_c: case NVPTX::BI__nvvm_ldg_sc: diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index ffa41c85c2734..71b29849618b6 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -333,10 +333,10 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df, // CHECK: atomicrmw fadd ptr {{.*}} seq_cst, align 4 __nvvm_atom_add_gen_f(fp, f); - // CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0 + // CHECK: atomicrmw uinc_wrap ptr {{.*}} seq_cst, align 4 __nvvm_atom_inc_gen_ui(uip, ui); - // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0 + // CHECK: atomicrmw udec_wrap ptr {{.*}} seq_cst, align 4 __nvvm_atom_dec_gen_ui(uip, ui); diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 3e9588a515c9e..4aeb1d8a2779e 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -124,6 +124,8 @@ // * llvm.nvvm.ldg.global.f --> ibid. // * llvm.nvvm.ldg.global.p --> ibid. // * llvm.nvvm.swap.lo.hi.b64 --> llvm.fshl(x, x, 32) +// * llvm.nvvm.atomic.load.inc.32 --> atomicrmw uinc_wrap +// * llvm.nvvm.atomic.load.dec.32 --> atomicrmw udec_wrap def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr @@ -1633,14 +1635,6 @@ let TargetPrefix = "nvvm" in { DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; -// Atomics not available as llvm intrinsics. - def int_nvvm_atomic_load_inc_32 : Intrinsic<[llvm_i32_ty], - [llvm_anyptr_ty, llvm_i32_ty], - [IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>]>; - def int_nvvm_atomic_load_dec_32 : Intrinsic<[llvm_i32_ty], - [llvm_anyptr_ty, llvm_i32_ty], - [IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>]>; - class SCOPED_ATOMIC2_impl<LLVMType elty> : Intrinsic<[elty], [llvm_anyptr_ty, LLVMMatchType<0>], diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 42a5fbec95174..9c241b6c4df0f 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -1825,6 +1825,8 @@ defm atomic_load_min : binary_atomic_op<atomic_load_min>; defm atomic_load_max : binary_atomic_op<atomic_load_max>; defm atomic_load_umin : binary_atomic_op<atomic_load_umin>; defm atomic_load_umax : binary_atomic_op<atomic_load_umax>; +defm atomic_load_uinc_wrap : binary_atomic_op<atomic_load_uinc_wrap>; +defm atomic_load_udec_wrap : binary_atomic_op<atomic_load_udec_wrap>; defm atomic_cmp_swap : ternary_atomic_op<atomic_cmp_swap>; /// Atomic load which zeroes the excess high bits. diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 963fb1b6ad8c0..0b329d91c3c7c 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1302,6 +1302,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, else if (Name.consume_front("atomic.load.add.")) // nvvm.atomic.load.add.{f32.p,f64.p} Expand = Name.starts_with("f32.p") || Name.starts_with("f64.p"); + else if (Name.consume_front("atomic.load.") && Name.consume_back(".32")) + // nvvm.atomic.load.{inc,dec}.32 + Expand = Name == "inc" || Name == "dec"; else if (Name.consume_front("bitcast.")) // nvvm.bitcast.{f2i,i2f,ll2d,d2ll} Expand = @@ -2314,6 +2317,12 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, Value *Val = CI->getArgOperand(1); Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(), AtomicOrdering::SequentiallyConsistent); + } else if (Name.consume_front("atomic.load.") && Name.consume_back(".32")) { + Value *Ptr = CI->getArgOperand(0); + Value *Val = CI->getArgOperand(1); + auto Op = Name == "inc" ? AtomicRMWInst::UIncWrap : AtomicRMWInst::UDecWrap; + Rep = Builder.CreateAtomicRMW(Op, Ptr, Val, MaybeAlign(), + AtomicOrdering::SequentiallyConsistent); } else if (Name.consume_front("max.") && (Name == "s" || Name == "i" || Name == "ll" || Name == "us" || Name == "ui" || Name == "ull")) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index b566cdd4b6bfc..904890b01596d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -4067,9 +4067,6 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( return true; } - case Intrinsic::nvvm_atomic_load_inc_32: - case Intrinsic::nvvm_atomic_load_dec_32: - case Intrinsic::nvvm_atomic_add_gen_f_cta: case Intrinsic::nvvm_atomic_add_gen_f_sys: case Intrinsic::nvvm_atomic_add_gen_i_cta: @@ -6145,6 +6142,18 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { default: llvm_unreachable("unsupported width encountered"); } + case AtomicRMWInst::BinOp::UIncWrap: + case AtomicRMWInst::BinOp::UDecWrap: + switch (ITy->getBitWidth()) { + case 32: + return AtomicExpansionKind::None; + case 8: + case 16: + case 64: + return AtomicExpansionKind::CmpXChg; + default: + llvm_unreachable("unsupported width encountered"); + } } return AtomicExpansionKind::CmpXChg; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 34cb63e44ca71..8528ff702f236 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2070,8 +2070,8 @@ defm INT_PTX_ATOMIC_UMIN_32 : F_ATOMIC_2_AS<I32RT, atomic_load_umin_i32, "min.u3 defm INT_PTX_ATOMIC_UMIN_64 : F_ATOMIC_2_AS<I64RT, atomic_load_umin_i64, "min.u64", [hasSM<32>]>; // atom_inc atom_dec -defm INT_PTX_ATOM_INC_32 : F_ATOMIC_2_AS<I32RT, int_nvvm_atomic_load_inc_32, "inc.u32">; -defm INT_PTX_ATOM_DEC_32 : F_ATOMIC_2_AS<I32RT, int_nvvm_atomic_load_dec_32, "dec.u32">; +defm INT_PTX_ATOM_INC_32 : F_ATOMIC_2_AS<I32RT, atomic_load_uinc_wrap_i32, "inc.u32">; +defm INT_PTX_ATOM_DEC_32 : F_ATOMIC_2_AS<I32RT, atomic_load_udec_wrap_i32, "dec.u32">; // atom_and defm INT_PTX_ATOM_AND_32 : F_ATOMIC_2_AS<I32RT, atomic_load_and_i32, "and.b32">; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index e359735c20750..81ad01bea8867 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -46,33 +46,31 @@ static bool readsLaneId(const IntrinsicInst *II) { // Whether the given intrinsic is an atomic instruction in PTX. static bool isNVVMAtomic(const IntrinsicInst *II) { switch (II->getIntrinsicID()) { - default: return false; - case Intrinsic::nvvm_atomic_load_inc_32: - case Intrinsic::nvvm_atomic_load_dec_32: - - case Intrinsic::nvvm_atomic_add_gen_f_cta: - case Intrinsic::nvvm_atomic_add_gen_f_sys: - case Intrinsic::nvvm_atomic_add_gen_i_cta: - case Intrinsic::nvvm_atomic_add_gen_i_sys: - case Intrinsic::nvvm_atomic_and_gen_i_cta: - case Intrinsic::nvvm_atomic_and_gen_i_sys: - case Intrinsic::nvvm_atomic_cas_gen_i_cta: - case Intrinsic::nvvm_atomic_cas_gen_i_sys: - case Intrinsic::nvvm_atomic_dec_gen_i_cta: - case Intrinsic::nvvm_atomic_dec_gen_i_sys: - case Intrinsic::nvvm_atomic_inc_gen_i_cta: - case Intrinsic::nvvm_atomic_inc_gen_i_sys: - case Intrinsic::nvvm_atomic_max_gen_i_cta: - case Intrinsic::nvvm_atomic_max_gen_i_sys: - case Intrinsic::nvvm_atomic_min_gen_i_cta: - case Intrinsic::nvvm_atomic_min_gen_i_sys: - case Intrinsic::nvvm_atomic_or_gen_i_cta: - case Intrinsic::nvvm_atomic_or_gen_i_sys: - case Intrinsic::nvvm_atomic_exch_gen_i_cta: - case Intrinsic::nvvm_atomic_exch_gen_i_sys: - case Intrinsic::nvvm_atomic_xor_gen_i_cta: - case Intrinsic::nvvm_atomic_xor_gen_i_sys: - return true; + default: + return false; + case Intrinsic::nvvm_atomic_add_gen_f_cta: + case Intrinsic::nvvm_atomic_add_gen_f_sys: + case Intrinsic::nvvm_atomic_add_gen_i_cta: + case Intrinsic::nvvm_atomic_add_gen_i_sys: + case Intrinsic::nvvm_atomic_and_gen_i_cta: + case Intrinsic::nvvm_atomic_and_gen_i_sys: + case Intrinsic::nvvm_atomic_cas_gen_i_cta: + case Intrinsic::nvvm_atomic_cas_gen_i_sys: + case Intrinsic::nvvm_atomic_dec_gen_i_cta: + case Intrinsic::nvvm_atomic_dec_gen_i_sys: + case Intrinsic::nvvm_atomic_inc_gen_i_cta: + case Intrinsic::nvvm_atomic_inc_gen_i_sys: + case Intrinsic::nvvm_atomic_max_gen_i_cta: + case Intrinsic::nvvm_atomic_max_gen_i_sys: + case Intrinsic::nvvm_atomic_min_gen_i_cta: + case Intrinsic::nvvm_atomic_min_gen_i_sys: + case Intrinsic::nvvm_atomic_or_gen_i_cta: + case Intrinsic::nvvm_atomic_or_gen_i_sys: + case Intrinsic::nvvm_atomic_exch_gen_i_cta: + case Intrinsic::nvvm_atomic_exch_gen_i_sys: + case Intrinsic::nvvm_atomic_xor_gen_i_cta: + case Intrinsic::nvvm_atomic_xor_gen_i_sys: + return true; } } diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll index 588e79a7428a4..74b9640df6977 100644 --- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll +++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll @@ -52,6 +52,9 @@ declare i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr, i32) declare ptr @llvm.nvvm.ldg.global.p.p0(ptr, i32) declare float @llvm.nvvm.ldg.global.f.f32.p0(ptr, i32) +declare i32 @llvm.nvvm.atomic.load.inc.32(ptr, i32) +declare i32 @llvm.nvvm.atomic.load.dec.32(ptr, i32) + ; CHECK-LABEL: @simple_upgrade define void @simple_upgrade(i32 %a, i64 %b, i16 %c) { ; CHECK: call i32 @llvm.bitreverse.i32(i32 %a) @@ -224,4 +227,15 @@ define void @ldg(ptr %p0, ptr addrspace(1) %p1) { %v6 = call float @llvm.nvvm.ldg.global.f.f32.p0(ptr %p0, i32 16) ret void -} \ No newline at end of file +} + +; CHECK-LABEL: @atomics +define i32 @atomics(ptr %p0, i32 %a) { +; CHECK: %1 = atomicrmw uinc_wrap ptr %p0, i32 %a seq_cst +; CHECK: %2 = atomicrmw udec_wrap ptr %p0, i32 %a seq_cst + + %r1 = call i32 @llvm.nvvm.atomic.load.inc.32(ptr %p0, i32 %a) + %r2 = call i32 @llvm.nvvm.atomic.load.dec.32(ptr %p0, i32 %a) + ret i32 %r2 +} + diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll index e1d9aaf7cfb20..bb04aa856d656 100644 --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll @@ -313,6 +313,38 @@ define i64 @atom19(ptr %subr, i64 %val) { ret i64 %ret } +define i32 @atom20(ptr %subr, i32 %val) { +; CHECK-LABEL: atom20( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom20_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom20_param_1]; +; CHECK-NEXT: atom.inc.u32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %ret = atomicrmw uinc_wrap ptr %subr, i32 %val seq_cst + ret i32 %ret +} + +define i32 @atom21(ptr %subr, i32 %val) { +; CHECK-LABEL: atom21( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [atom21_param_0]; +; CHECK-NEXT: ld.param.u32 %r1, [atom21_param_1]; +; CHECK-NEXT: atom.dec.u32 %r2, [%rd1], %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %ret = atomicrmw udec_wrap ptr %subr, i32 %val seq_cst + ret i32 %ret +} + declare float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val) ; CHECK-LABEL: atomic_add_f32_generic @@ -409,7 +441,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) { ; CHECK-NEXT: not.b32 %r2, %r9; ; CHECK-NEXT: ld.u32 %r16, [%rd1]; ; CHECK-NEXT: cvt.f32.f16 %f2, %rs1; -; CHECK-NEXT: $L__BB22_1: // %atomicrmw.start +; CHECK-NEXT: $L__BB24_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: shr.u32 %r10, %r16, %r1; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r10; @@ -424,7 +456,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) { ; CHECK-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r14; ; CHECK-NEXT: setp.ne.s32 %p1, %r5, %r16; ; CHECK-NEXT: mov.b32 %r16, %r5; -; CHECK-NEXT: @%p1 bra $L__BB22_1; +; CHECK-NEXT: @%p1 bra $L__BB24_1; ; CHECK-NEXT: // %bb.2: // %atomicrmw.end ; CHECK-NEXT: shr.u32 %r15, %r5, %r1; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r15; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits