https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/109410
>From 834ff3b40bd82cb54bb33532a54ad36870ea2b24 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <matthew.arsena...@amd.com>
Date: Wed, 14 Aug 2024 13:57:14 +0400
Subject: [PATCH 1/2] AMDGPU: Custom expand flat cmpxchg which may access private

64-bit flat cmpxchg instructions do not work correctly for scratch
addresses, and need to be expanded as non-atomic.

Allow custom expansion of cmpxchg in AtomicExpand, as is already the
case for atomicrmw.
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |    5 +
 .../llvm/Transforms/Utils/LowerAtomic.h       |    7 +
 llvm/lib/CodeGen/AtomicExpandPass.cpp         |    4 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  146 ++-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |    3 +
 llvm/lib/Transforms/Utils/LowerAtomic.cpp     |   21 +-
 llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll  | 1019 +++++++++++++++--
 ...expand-atomicrmw-flat-noalias-addrspace.ll |    6 +-
 ...expand-atomicrmw-integer-ops-0-to-add-0.ll |    6 +-
 .../expand-cmpxchg-flat-maybe-private.ll      |  104 +-
 10 files changed, 1157 insertions(+), 164 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8e0cdc6f1a5e77..e0b638201a0474 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2204,6 +2204,11 @@ class TargetLoweringBase {
         "Generic atomicrmw expansion unimplemented on this target");
   }
 
+  /// Perform a cmpxchg expansion using a target-specific method.
+  virtual void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+    llvm_unreachable("Generic cmpxchg expansion unimplemented on this target");
+  }
+
   /// Perform a bit test atomicrmw using a target-specific intrinsic. This
   /// represents the combined bit test intrinsic which will be lowered at a late
   /// stage by the backend.
diff --git a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h
index b25b281667f9cb..295c2bd2b4b47e 100644
--- a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h
@@ -23,6 +23,13 @@ class IRBuilderBase;
 /// Convert the given Cmpxchg into primitive load and compare.
 bool lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI);
 
+/// Emit IR to implement the given cmpxchg operation on values in registers,
+/// returning the new value.
+std::pair<Value *, Value *> buildAtomicCmpXchgValue(IRBuilderBase &Builder,
+                                                    Value *Ptr, Value *Cmp,
+                                                    Value *Val,
+                                                    Align Alignment);
+
 /// Convert the given RMWI into primitive load and stores,
 /// assuming that doing so is legal. Return true if the lowering
 /// succeeds.
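For readers skimming the diff, the net effect on a 64-bit flat cmpxchg that may
access private memory is an explicit address-space check: the private path is
lowered to a plain load/compare/store, and the non-private path keeps the flat
atomic. The LLVM IR below is a hand-written sketch of that shape, not the
verbatim output of the pass; the block names follow the ones created in
SIISelLowering.cpp, while the private-address check intrinsic, orderings, and
value names are illustrative assumptions:

  %is.priv = call i1 @llvm.amdgcn.is.private(ptr %ptr)  ; assumed check for a private (scratch) address
  br i1 %is.priv, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:                                      ; non-atomic expansion, safe for scratch
  %ptr.priv = addrspacecast ptr %ptr to ptr addrspace(5)
  %loaded.private = load i64, ptr addrspace(5) %ptr.priv, align 8
  %eq = icmp eq i64 %loaded.private, %old
  %new = select i1 %eq, i64 %in, i64 %loaded.private
  store i64 %new, ptr addrspace(5) %ptr.priv, align 8
  %priv.0 = insertvalue { i64, i1 } poison, i64 %loaded.private, 0
  %priv = insertvalue { i64, i1 } %priv.0, i1 %eq, 1
  br label %atomicrmw.phi

atomicrmw.global:                                       ; original flat cmpxchg, now known not private
  %global = cmpxchg ptr %ptr, i64 %old, i64 %in seq_cst seq_cst
  br label %atomicrmw.phi

atomicrmw.phi:
  %res = phi { i64, i1 } [ %priv, %atomicrmw.private ], [ %global, %atomicrmw.global ]
  br label %atomicrmw.end

Since FullFlatEmulation only applies to the f32 fadd case, the cmpxchg
expansion keeps a single flat instruction on the non-private path rather than
also splitting out the shared and global cases.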
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 0aff4f1f5cf1cb..1471e3d7cbc29d 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -1674,6 +1674,10 @@ bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) { return true; case TargetLoweringBase::AtomicExpansionKind::NotAtomic: return lowerAtomicCmpXchgInst(CI); + case TargetLoweringBase::AtomicExpansionKind::Expand: { + TLI->emitExpandAtomicCmpXchg(CI); + return true; + } } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d66610ae0a160d..c8a46875bda408 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16577,9 +16577,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { - return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS - ? AtomicExpansionKind::NotAtomic - : AtomicExpansionKind::None; + unsigned AddrSpace = CmpX->getPointerAddressSpace(); + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) + return AtomicExpansionKind::NotAtomic; + + if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX)) + return AtomicExpansionKind::None; + + const DataLayout &DL = CmpX->getDataLayout(); + + Type *ValTy = CmpX->getNewValOperand()->getType(); + + // If a 64-bit flat atomic may alias private, we need to avoid using the + // atomic in the private case. + return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand + : AtomicExpansionKind::None; } const TargetRegisterClass * @@ -16745,40 +16757,8 @@ bool SITargetLowering::checkForPhysRegDependency( return false; } -void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { - AtomicRMWInst::BinOp Op = AI->getOperation(); - - if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or || - Op == AtomicRMWInst::Xor) { - if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand()); - ConstVal && ConstVal->isNullValue()) { - // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0 - AI->setOperation(AtomicRMWInst::Add); - - // TODO: Turn the below private handling into a no-op for idempotent - // cases. - } - } - - // The non-flat expansions should only perform the de-canonicalization of - // identity values. - if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS) - return; - - // FullFlatEmulation is true if we need to issue the private, shared, and - // global cases. - // - // If this is false, we are only dealing with the flat-targeting-private case, - // where we only insert a check for private and still use the flat instruction - // for global and shared. - - // TODO: Avoid the private check for the fadd case depending on - // noalias.addrspace. - - bool FullFlatEmulation = Op == AtomicRMWInst::FAdd && - Subtarget->hasAtomicFaddInsts() && - AI->getType()->isFloatTy(); - +void SITargetLowering::emitExpandAtomicAddrSpacePredicate( + Instruction *AI) const { // Given: atomicrmw fadd ptr %addr, float %val ordering // // With this expansion we produce the following code: @@ -16825,6 +16805,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { IRBuilder<> Builder(AI); LLVMContext &Ctx = Builder.getContext(); + auto *RMW = dyn_cast<AtomicRMWInst>(AI); + const unsigned PtrOpIdx = RMW ? 
AtomicRMWInst::getPointerOperandIndex() + : AtomicCmpXchgInst::getPointerOperandIndex(); + Value *Addr = AI->getOperand(PtrOpIdx); + + /// TODO: Only need to check private, then emit flat-known-not private (no + /// need for shared block, or cast to global). + AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI); + + Align Alignment; + if (RMW) + Alignment = RMW->getAlign(); + else if (CX) + Alignment = CX->getAlign(); + else + llvm_unreachable("unhandled atomic operation"); + + // FullFlatEmulation is true if we need to issue the private, shared, and + // global cases. + // + // If this is false, we are only dealing with the flat-targeting-private case, + // where we only insert a check for private and still use the flat instruction + // for global and shared. + + bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd && + Subtarget->hasAtomicFaddInsts() && + RMW->getType()->isFloatTy(); + // If the return value isn't used, do not introduce a false use in the phi. bool ReturnValueIsUsed = !AI->use_empty(); @@ -16846,11 +16854,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB); BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB); - Value *Val = AI->getValOperand(); - Type *ValTy = Val->getType(); - Value *Addr = AI->getPointerOperand(); - Align Alignment = AI->getAlign(); - std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); @@ -16865,8 +16868,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { Instruction *Clone = AI->clone(); Clone->insertInto(SharedBB, SharedBB->end()); - Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex()) - .set(CastToLocal); + Clone->getOperandUse(PtrOpIdx).set(CastToLocal); LoadedShared = Clone; Builder.CreateBr(PhiBB); @@ -16878,14 +16880,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB); Builder.SetInsertPoint(PrivateBB); + Value *CastToPrivate = Builder.CreateAddrSpaceCast( Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS)); - Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate, - Alignment, "loaded.private"); - Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val); + Value *LoadedPrivate; + if (RMW) { + LoadedPrivate = Builder.CreateAlignedLoad( + RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private"); + + Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder, + LoadedPrivate, RMW->getValOperand()); + + Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign()); + } else { + auto [ResultLoad, Equal] = + buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(), + CX->getNewValOperand(), CX->getAlign()); + + Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()), + ResultLoad, 0); + LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1); + } - Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment); Builder.CreateBr(PhiBB); Builder.SetInsertPoint(GlobalBB); @@ -16895,8 +16912,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { if (FullFlatEmulation) { Value *CastToGlobal = Builder.CreateAddrSpaceCast( Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS)); - AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()) - .set(CastToGlobal); + AI->getOperandUse(PtrOpIdx).set(CastToGlobal); } AI->removeFromParent(); @@ -16920,7 +16936,7 @@ void 
SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { Builder.SetInsertPoint(PhiBB); if (ReturnValueIsUsed) { - PHINode *Loaded = Builder.CreatePHI(ValTy, 3); + PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3); AI->replaceAllUsesWith(Loaded); if (FullFlatEmulation) Loaded->addIncoming(LoadedShared, SharedBB); @@ -16932,6 +16948,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { Builder.CreateBr(ExitBB); } +void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { + AtomicRMWInst::BinOp Op = AI->getOperation(); + + if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or || + Op == AtomicRMWInst::Xor) { + if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand()); + ConstVal && ConstVal->isNullValue()) { + // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0 + AI->setOperation(AtomicRMWInst::Add); + + // We may still need the private-alias-flat handling below. + + // TODO: Skip this for cases where we cannot access remote memory. + } + } + + // The non-flat expansions should only perform the de-canonicalization of + // identity values. + if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS) + return; + + emitExpandAtomicAddrSpacePredicate(AI); +} + +void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const { + emitExpandAtomicAddrSpacePredicate(CI); +} + LoadInst * SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { IRBuilder<> Builder(AI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 6c3edf37945e24..32e110fdfa84d4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -544,7 +544,10 @@ class SITargetLowering final : public AMDGPUTargetLowering { AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override; AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; + + void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const; void emitExpandAtomicRMW(AtomicRMWInst *AI) const override; + void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override; LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; diff --git a/llvm/lib/Transforms/Utils/LowerAtomic.cpp b/llvm/lib/Transforms/Utils/LowerAtomic.cpp index 8b3a0ce338e577..89c49d4a0732f6 100644 --- a/llvm/lib/Transforms/Utils/LowerAtomic.cpp +++ b/llvm/lib/Transforms/Utils/LowerAtomic.cpp @@ -25,13 +25,11 @@ bool llvm::lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { Value *Cmp = CXI->getCompareOperand(); Value *Val = CXI->getNewValOperand(); - LoadInst *Orig = - Builder.CreateAlignedLoad(Val->getType(), Ptr, CXI->getAlign()); - Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); - Value *Res = Builder.CreateSelect(Equal, Val, Orig); - Builder.CreateAlignedStore(Res, Ptr, CXI->getAlign()); + auto [Orig, Equal] = + buildAtomicCmpXchgValue(Builder, Ptr, Cmp, Val, CXI->getAlign()); - Res = Builder.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0); + Value *Res = + Builder.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0); Res = Builder.CreateInsertValue(Res, Equal, 1); CXI->replaceAllUsesWith(Res); @@ -39,6 +37,17 @@ bool llvm::lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { return true; } +std::pair<Value *, Value *> +llvm::buildAtomicCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, + Value *Val, Align Alignment) { + LoadInst *Orig = Builder.CreateAlignedLoad(Val->getType(), Ptr, Alignment); + Value *Equal = 
Builder.CreateICmpEQ(Orig, Cmp); + Value *Res = Builder.CreateSelect(Equal, Val, Orig); + Builder.CreateAlignedStore(Res, Ptr, Alignment); + + return {Orig, Equal}; +} + Value *llvm::buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val) { diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index 839f4a18508e5b..f2959dc19ba4c0 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -12232,11 +12232,29 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_offset: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s2, s4, 32 ; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB90_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB90_4 +; GCN1-NEXT: .LBB90_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB90_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s3 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 @@ -12246,15 +12264,51 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB90_2 +; GCN1-NEXT: .LBB90_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i64_offset: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s2, s4, 32 ; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB90_3 +; GCN2-NEXT: ; %bb.1: ; 
%Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB90_4 +; GCN2-NEXT: .LBB90_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB90_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s3 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 @@ -12264,6 +12318,23 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB90_2 +; GCN2-NEXT: .LBB90_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_offset: @@ -12271,13 +12342,38 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB90_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB90_4 +; GFX12-NEXT: .LBB90_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB90_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB90_2 +; GFX12-NEXT: .LBB90_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s7, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s6, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s2 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -12288,11 +12384,29 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_soffset: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: 
s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s2, s4, 0x11940 ; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB91_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB91_4 +; GCN1-NEXT: .LBB91_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB91_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s3 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 @@ -12302,15 +12416,51 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB91_2 +; GCN1-NEXT: .LBB91_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i64_soffset: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s2, s4, 0x11940 ; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB91_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB91_4 +; GCN2-NEXT: .LBB91_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB91_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s3 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 @@ -12320,6 +12470,23 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB91_2 +; GCN2-NEXT: .LBB91_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: 
buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_soffset: @@ -12327,13 +12494,38 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 0x11940 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB91_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB91_4 +; GFX12-NEXT: .LBB91_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB91_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB91_2 +; GFX12-NEXT: .LBB91_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s7, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s6, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s2 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 9000 @@ -12344,54 +12536,144 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB92_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: 
v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB92_3 +; GCN1-NEXT: s_branch .LBB92_4 +; GCN1-NEXT: .LBB92_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB92_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB92_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB92_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s9 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB92_3 +; GCN2-NEXT: s_branch .LBB92_4 +; GCN2-NEXT: .LBB92_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB92_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] 
+; GCN2-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB92_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s8, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX12-NEXT: s_cbranch_vccz .LBB92_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB92_3 +; GFX12-NEXT: s_branch .LBB92_4 +; GFX12-NEXT: .LBB92_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB92_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v3, v1, s5, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v0, s4, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB92_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -12406,42 +12688,113 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB93_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB93_4 +; GCN1-NEXT: .LBB93_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB93_3: 
; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB93_2 +; GCN1-NEXT: .LBB93_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB93_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB93_4 +; GCN2-NEXT: .LBB93_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB93_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB93_2 +; GCN2-NEXT: .LBB93_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; 
GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: @@ -12449,13 +12802,39 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB93_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB93_4 +; GFX12-NEXT: .LBB93_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB93_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB93_2 +; GFX12-NEXT: .LBB93_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -12467,23 +12846,57 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s9 ; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; GCN1-NEXT: s_load_dword s12, s[2:3], 0x43 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: s_add_u32 s0, s4, s2 +; GCN1-NEXT: s_add_u32 s2, s4, s2 ; GCN1-NEXT: s_addc_u32 s3, s5, s3 -; GCN1-NEXT: s_add_u32 s2, s0, 32 +; GCN1-NEXT: s_add_u32 s2, s2, 32 ; GCN1-NEXT: s_addc_u32 s3, s3, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s12 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB94_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s3 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 ; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; 
GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v4, s2 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB94_3 +; GCN1-NEXT: s_branch .LBB94_4 +; GCN1-NEXT: .LBB94_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB94_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen +; GCN1-NEXT: .LBB94_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -12491,23 +12904,56 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 ; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GCN2-NEXT: s_load_dword s12, s[2:3], 0x10c +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: s_add_u32 s0, s4, s2 +; GCN2-NEXT: s_add_u32 s2, s4, s2 ; GCN2-NEXT: s_addc_u32 s3, s5, s3 -; GCN2-NEXT: s_add_u32 s2, s0, 32 +; GCN2-NEXT: s_add_u32 s2, s2, 32 ; GCN2-NEXT: s_addc_u32 s3, s3, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s12 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB94_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s3 ; GCN2-NEXT: v_mov_b32_e32 v0, s8 ; GCN2-NEXT: v_mov_b32_e32 v1, s9 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v4, s2 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB94_3 +; GCN2-NEXT: s_branch .LBB94_4 +; GCN2-NEXT: .LBB94_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB94_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; 
GCN2-NEXT: .LBB94_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -12519,14 +12965,38 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX12-NEXT: s_lshl_b64 s[10:11], s[10:11], 3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[10:11] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 32 +; GFX12-NEXT: s_cmp_eq_u32 s3, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB94_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 -; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB94_3 +; GFX12-NEXT: s_branch .LBB94_4 +; GFX12-NEXT: .LBB94_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB94_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v3, v1, s9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v0, s8, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB94_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -12542,34 +13012,105 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB95_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB95_4 +; GCN1-NEXT: .LBB95_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB95_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; 
GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB95_2 +; GCN1-NEXT: .LBB95_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i64: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB95_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB95_4 +; GCN2-NEXT: .LBB95_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB95_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB95_2 +; GCN2-NEXT: .LBB95_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64: @@ -12577,13 +13118,36 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_mov_b32 s2, -1 +; 
GFX12-NEXT: s_cbranch_vccnz .LBB95_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB95_4 +; GFX12-NEXT: .LBB95_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB95_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB95_2 +; GFX12-NEXT: .LBB95_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s7, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s6, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s2 ; GFX12-NEXT: s_endpgm entry: %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst @@ -12593,50 +13157,138 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB96_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v0, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s9 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB96_3 +; GCN1-NEXT: s_branch .LBB96_4 +; GCN1-NEXT: .LBB96_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB96_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB96_4: ; 
%atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB96_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v0, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s9 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB96_3 +; GCN2-NEXT: s_branch .LBB96_4 +; GCN2-NEXT: .LBB96_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB96_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB96_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s8, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 +; GFX12-NEXT: s_cbranch_vccz .LBB96_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB96_3 +; GFX12-NEXT: s_branch .LBB96_4 +; GFX12-NEXT: .LBB96_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB96_3: ; 
%atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v3, v1, s5, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v0, s4, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB96_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -12650,52 +13302,148 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB97_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB97_4 +; GCN1-NEXT: .LBB97_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB97_3: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s10 +; GCN1-NEXT: v_mov_b32_e32 v3, s11 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB97_2 +; GCN1-NEXT: .LBB97_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_cmpxchg_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; 
GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[8:9], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB97_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB97_4 +; GCN2-NEXT: .LBB97_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB97_3: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s10 +; GCN2-NEXT: v_mov_b32_e32 v3, s11 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB97_2 +; GCN2-NEXT: .LBB97_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpxchg_i64_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB97_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB97_4 +; GFX12-NEXT: .LBB97_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB97_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB97_2 +; GFX12-NEXT: .LBB97_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[0:1] +; 
GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -12706,12 +13454,24 @@ entry: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { ; GCN1-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s18, -1 +; GCN1-NEXT: s_mov_b32 s19, 0xe8f000 +; GCN1-NEXT: s_add_u32 s16, s16, s9 ; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s12, s[2:3], 0x43 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; GCN1-NEXT: s_addc_u32 s17, s17, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN1-NEXT: s_add_u32 s2, s4, s2 ; GCN1-NEXT: s_addc_u32 s3, s5, s3 +; GCN1-NEXT: s_cmp_eq_u32 s3, s12 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB98_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v5, s3 ; GCN1-NEXT: v_mov_b32_e32 v0, s8 ; GCN1-NEXT: v_mov_b32_e32 v1, s9 @@ -12721,6 +13481,28 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB98_3 +; GCN1-NEXT: s_branch .LBB98_4 +; GCN1-NEXT: .LBB98_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB98_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen +; GCN1-NEXT: .LBB98_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -12728,12 +13510,24 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; ; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 ; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s12, s[2:3], 0x10c ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 ; GCN2-NEXT: s_add_u32 s2, s4, s2 ; GCN2-NEXT: s_addc_u32 s3, s5, s3 +; GCN2-NEXT: s_cmp_eq_u32 s3, s12 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB98_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v5, s3 ; GCN2-NEXT: 
v_mov_b32_e32 v0, s8 ; GCN2-NEXT: v_mov_b32_e32 v1, s9 @@ -12743,6 +13537,27 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB98_3 +; GCN2-NEXT: s_branch .LBB98_4 +; GCN2-NEXT: .LBB98_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB98_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB98_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -12755,13 +13570,35 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: s_mov_b64 s[10:11], src_private_base ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s11 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB98_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB98_3 +; GFX12-NEXT: s_branch .LBB98_4 +; GFX12-NEXT: .LBB98_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB98_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v3, v1, s9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v0, s8, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB98_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll index 9e9503dfbd3819..380f376ce9c801 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll @@ -43,7 +43,7 @@ define i64 
@test_flat_atomicrmw_sub_0_i64_agent(ptr %ptr) { ; ALL: [[ATOMICRMW_PRIVATE]]: ; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) ; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 -; ALL-NEXT: [[NEW:%.*]] = sub i64 [[LOADED_PRIVATE]], 0 +; ALL-NEXT: [[NEW:%.*]] = add i64 [[LOADED_PRIVATE]], 0 ; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 ; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] ; ALL: [[ATOMICRMW_GLOBAL]]: @@ -67,7 +67,7 @@ define i64 @test_flat_atomicrmw_or_0_i64_agent(ptr %ptr) { ; ALL: [[ATOMICRMW_PRIVATE]]: ; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) ; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 -; ALL-NEXT: [[NEW:%.*]] = or i64 [[LOADED_PRIVATE]], 0 +; ALL-NEXT: [[NEW:%.*]] = add i64 [[LOADED_PRIVATE]], 0 ; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 ; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] ; ALL: [[ATOMICRMW_GLOBAL]]: @@ -91,7 +91,7 @@ define i64 @test_flat_atomicrmw_xor_0_i64_agent(ptr %ptr) { ; ALL: [[ATOMICRMW_PRIVATE]]: ; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) ; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 -; ALL-NEXT: [[NEW:%.*]] = xor i64 [[LOADED_PRIVATE]], 0 +; ALL-NEXT: [[NEW:%.*]] = add i64 [[LOADED_PRIVATE]], 0 ; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 ; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] ; ALL: [[ATOMICRMW_GLOBAL]]: diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll index 80058b3cef4ea1..2bfcc5897c3824 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll @@ -40,14 +40,14 @@ define i32 @test_atomicrmw_or_0_flat_system(ptr %ptr) { ; CHECK: atomicrmw.private: ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) ; CHECK-NEXT: [[LOADED_PRIVATE:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4 -; CHECK-NEXT: [[NEW:%.*]] = or i32 [[LOADED_PRIVATE]], 0 +; CHECK-NEXT: [[NEW:%.*]] = add i32 [[LOADED_PRIVATE]], 0 ; CHECK-NEXT: store i32 [[NEW]], ptr addrspace(5) [[TMP1]], align 4 ; CHECK-NEXT: br label [[ATOMICRMW_PHI:%.*]] ; CHECK: atomicrmw.global: -; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i32 0 seq_cst, align 4, !noalias.addrspace [[META1:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[PTR]], i32 0 seq_cst, align 4, !noalias.addrspace [[META1:![0-9]+]] ; CHECK-NEXT: br label [[ATOMICRMW_PHI]] ; CHECK: atomicrmw.phi: -; CHECK-NEXT: [[RES1:%.*]] = phi i32 [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[RES]], [[ATOMICRMW_GLOBAL]] ] +; CHECK-NEXT: [[RES1:%.*]] = phi i32 [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ] ; CHECK-NEXT: br label [[ATOMICRMW_END:%.*]] ; CHECK: atomicrmw.end: ; CHECK-NEXT: ret i32 [[RES1]] diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-cmpxchg-flat-maybe-private.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-cmpxchg-flat-maybe-private.ll index 6b3c27be8688c2..ff550cbfd75d4f 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-cmpxchg-flat-maybe-private.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-cmpxchg-flat-maybe-private.ll @@ -84,7 +84,24 @@ define { i32, i1 } @cmpxchg_flat_agent_i32(ptr %ptr, i32 %val, i32 
%swap) { define { i64, i1 } @cmpxchg_flat_agent_i64(ptr %ptr, i64 %val, i64 %swap) { ; CHECK-LABEL: define { i64, i1 } @cmpxchg_flat_agent_i64( ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[VAL:%.*]], i64 [[SWAP:%.*]]) { -; CHECK-NEXT: [[RESULT:%.*]] = cmpxchg ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8 +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; CHECK-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; CHECK: [[ATOMICRMW_PRIVATE]]: +; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr addrspace(5) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], [[VAL]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[SWAP]], i64 [[TMP4]] +; CHECK-NEXT: store i64 [[TMP6]], ptr addrspace(5) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { i64, i1 } poison, i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { i64, i1 } [[TMP7]], i1 [[TMP5]], 1 +; CHECK-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; CHECK: [[ATOMICRMW_GLOBAL]]: +; CHECK-NEXT: [[TMP9:%.*]] = cmpxchg ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8, !noalias.addrspace [[META0:![0-9]+]] +; CHECK-NEXT: br label %[[ATOMICRMW_PHI]] +; CHECK: [[ATOMICRMW_PHI]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi { i64, i1 } [ [[TMP8]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP9]], %[[ATOMICRMW_GLOBAL]] ] +; CHECK-NEXT: br label %[[ATOMICRMW_END:.*]] +; CHECK: [[ATOMICRMW_END]]: ; CHECK-NEXT: ret { i64, i1 } [[RESULT]] ; %result = cmpxchg ptr %ptr, i64 %val, i64 %swap syncscope("agent") monotonic seq_cst @@ -94,7 +111,24 @@ define { i64, i1 } @cmpxchg_flat_agent_i64(ptr %ptr, i64 %val, i64 %swap) { define { i64, i1 } @cmpxchg_flat_agent_i64_volatile(ptr %ptr, i64 %val, i64 %swap) { ; CHECK-LABEL: define { i64, i1 } @cmpxchg_flat_agent_i64_volatile( ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[VAL:%.*]], i64 [[SWAP:%.*]]) { -; CHECK-NEXT: [[RESULT:%.*]] = cmpxchg volatile ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8 +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; CHECK-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; CHECK: [[ATOMICRMW_PRIVATE]]: +; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr addrspace(5) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], [[VAL]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[SWAP]], i64 [[TMP4]] +; CHECK-NEXT: store i64 [[TMP6]], ptr addrspace(5) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { i64, i1 } poison, i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { i64, i1 } [[TMP7]], i1 [[TMP5]], 1 +; CHECK-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; CHECK: [[ATOMICRMW_GLOBAL]]: +; CHECK-NEXT: [[TMP9:%.*]] = cmpxchg volatile ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8, !noalias.addrspace [[META0]] +; CHECK-NEXT: br label %[[ATOMICRMW_PHI]] +; CHECK: [[ATOMICRMW_PHI]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi { i64, i1 } [ [[TMP8]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP9]], %[[ATOMICRMW_GLOBAL]] ] +; CHECK-NEXT: br label %[[ATOMICRMW_END:.*]] +; CHECK: [[ATOMICRMW_END]]: ; CHECK-NEXT: ret { i64, i1 } [[RESULT]] ; %result = cmpxchg volatile ptr %ptr, i64 %val, i64 %swap syncscope("agent") monotonic seq_cst @@ -144,7 +178,7 @@ 
define { i16, i1 } @cmpxchg_flat_agent_i16__noprivate(ptr %ptr, i16 %val, i16 %s define { i32, i1 } @cmpxchg_flat_agent_i32__noprivate(ptr %ptr, i32 %val, i32 %swap) { ; CHECK-LABEL: define { i32, i1 } @cmpxchg_flat_agent_i32__noprivate( ; CHECK-SAME: ptr [[PTR:%.*]], i32 [[VAL:%.*]], i32 [[SWAP:%.*]]) { -; CHECK-NEXT: [[RESULT:%.*]] = cmpxchg ptr [[PTR]], i32 [[VAL]], i32 [[SWAP]] syncscope("agent") monotonic seq_cst, align 4, !noalias.addrspace [[META0:![0-9]+]] +; CHECK-NEXT: [[RESULT:%.*]] = cmpxchg ptr [[PTR]], i32 [[VAL]], i32 [[SWAP]] syncscope("agent") monotonic seq_cst, align 4, !noalias.addrspace [[META0]] ; CHECK-NEXT: ret { i32, i1 } [[RESULT]] ; %result = cmpxchg ptr %ptr, i32 %val, i32 %swap syncscope("agent") monotonic seq_cst, !noalias.addrspace !0 @@ -164,7 +198,24 @@ define { i64, i1 } @cmpxchg_flat_agent_i64__noprivate(ptr %ptr, i64 %val, i64 %s define { i64, i1 } @cmpxchg_flat_agent_i64__nolocal(ptr %ptr, i64 %val, i64 %swap) { ; CHECK-LABEL: define { i64, i1 } @cmpxchg_flat_agent_i64__nolocal( ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[VAL:%.*]], i64 [[SWAP:%.*]]) { -; CHECK-NEXT: [[RESULT:%.*]] = cmpxchg ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8, !noalias.addrspace [[META1:![0-9]+]] +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; CHECK-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; CHECK: [[ATOMICRMW_PRIVATE]]: +; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr addrspace(5) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], [[VAL]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[SWAP]], i64 [[TMP4]] +; CHECK-NEXT: store i64 [[TMP6]], ptr addrspace(5) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { i64, i1 } poison, i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { i64, i1 } [[TMP7]], i1 [[TMP5]], 1 +; CHECK-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; CHECK: [[ATOMICRMW_GLOBAL]]: +; CHECK-NEXT: [[TMP9:%.*]] = cmpxchg ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8, !noalias.addrspace [[META0]] +; CHECK-NEXT: br label %[[ATOMICRMW_PHI]] +; CHECK: [[ATOMICRMW_PHI]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi { i64, i1 } [ [[TMP8]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP9]], %[[ATOMICRMW_GLOBAL]] ] +; CHECK-NEXT: br label %[[ATOMICRMW_END:.*]] +; CHECK: [[ATOMICRMW_END]]: ; CHECK-NEXT: ret { i64, i1 } [[RESULT]] ; %result = cmpxchg ptr %ptr, i64 %val, i64 %swap syncscope("agent") monotonic seq_cst, !noalias.addrspace !1 @@ -174,7 +225,24 @@ define { i64, i1 } @cmpxchg_flat_agent_i64__nolocal(ptr %ptr, i64 %val, i64 %swa define { i64, i1 } @cmpxchg_flat_agent_i64_mmra(ptr %ptr, i64 %val, i64 %swap) { ; CHECK-LABEL: define { i64, i1 } @cmpxchg_flat_agent_i64_mmra( ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[VAL:%.*]], i64 [[SWAP:%.*]]) { -; CHECK-NEXT: [[RESULT:%.*]] = cmpxchg ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8, !mmra [[META2:![0-9]+]] +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; CHECK-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; CHECK: [[ATOMICRMW_PRIVATE]]: +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], [[VAL]] +; CHECK-NEXT: 
[[TMP4:%.*]] = select i1 [[TMP3]], i64 [[SWAP]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP4]], ptr addrspace(5) [[TMP1]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = insertvalue { i64, i1 } poison, i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { i64, i1 } [[TMP5]], i1 [[TMP3]], 1 +; CHECK-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; CHECK: [[ATOMICRMW_GLOBAL]]: +; CHECK-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8, !mmra [[META1:![0-9]+]], !noalias.addrspace [[META0]] +; CHECK-NEXT: br label %[[ATOMICRMW_PHI]] +; CHECK: [[ATOMICRMW_PHI]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi { i64, i1 } [ [[TMP6]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP7]], %[[ATOMICRMW_GLOBAL]] ] +; CHECK-NEXT: br label %[[ATOMICRMW_END:.*]] +; CHECK: [[ATOMICRMW_END]]: ; CHECK-NEXT: ret { i64, i1 } [[RESULT]] ; %result = cmpxchg ptr %ptr, i64 %val, i64 %swap syncscope("agent") monotonic seq_cst, !mmra !4 @@ -184,7 +252,24 @@ define { i64, i1 } @cmpxchg_flat_agent_i64_mmra(ptr %ptr, i64 %val, i64 %swap) { define { i64, i1 } @cmpxchg_flat_agent_i64_mmra_noprivate(ptr %ptr, i64 %val, i64 %swap) { ; CHECK-LABEL: define { i64, i1 } @cmpxchg_flat_agent_i64_mmra_noprivate( ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[VAL:%.*]], i64 [[SWAP:%.*]]) { -; CHECK-NEXT: [[RESULT:%.*]] = cmpxchg ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8, !mmra [[META2]], !noalias.addrspace [[META1]] +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; CHECK-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; CHECK: [[ATOMICRMW_PRIVATE]]: +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], [[VAL]] +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 [[SWAP]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP4]], ptr addrspace(5) [[TMP1]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = insertvalue { i64, i1 } poison, i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { i64, i1 } [[TMP5]], i1 [[TMP3]], 1 +; CHECK-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; CHECK: [[ATOMICRMW_GLOBAL]]: +; CHECK-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[VAL]], i64 [[SWAP]] syncscope("agent") monotonic seq_cst, align 8, !mmra [[META1]], !noalias.addrspace [[META0]] +; CHECK-NEXT: br label %[[ATOMICRMW_PHI]] +; CHECK: [[ATOMICRMW_PHI]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi { i64, i1 } [ [[TMP6]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP7]], %[[ATOMICRMW_GLOBAL]] ] +; CHECK-NEXT: br label %[[ATOMICRMW_END:.*]] +; CHECK: [[ATOMICRMW_END]]: ; CHECK-NEXT: ret { i64, i1 } [[RESULT]] ; %result = cmpxchg ptr %ptr, i64 %val, i64 %swap syncscope("agent") monotonic seq_cst, !noalias.addrspace !1, !mmra !4 @@ -201,8 +286,7 @@ define { i64, i1 } @cmpxchg_flat_agent_i64_mmra_noprivate(ptr %ptr, i64 %val, i6 ;. ; CHECK: [[META0]] = !{i32 5, i32 6} -; CHECK: [[META1]] = !{i32 3, i32 4} -; CHECK: [[META2]] = !{[[META3:![0-9]+]], [[META4:![0-9]+]]} -; CHECK: [[META3]] = !{!"foo", !"bar"} -; CHECK: [[META4]] = !{!"bux", !"baz"} +; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[META2]] = !{!"foo", !"bar"} +; CHECK: [[META3]] = !{!"bux", !"baz"} ;. 
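For readers skimming the updated checks above: for a 64-bit flat cmpxchg that may alias private memory, the expanded IR these tests verify has roughly the shape sketched below. This is a hand-written illustration only; the function name, value names, and metadata number are made up here rather than copied from the generated output.

declare i1 @llvm.amdgcn.is.private(ptr)

define { i64, i1 } @cmpxchg_flat_maybe_private_sketch(ptr %ptr, i64 %val, i64 %swap) {
  ; Dynamically test whether the flat pointer actually addresses scratch.
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %ptr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:
  ; Scratch is thread-private, so a plain load/compare/select/store suffices.
  %scratch.ptr = addrspacecast ptr %ptr to ptr addrspace(5)
  %loaded = load i64, ptr addrspace(5) %scratch.ptr, align 8
  %eq = icmp eq i64 %loaded, %val
  %new = select i1 %eq, i64 %swap, i64 %loaded
  store i64 %new, ptr addrspace(5) %scratch.ptr, align 8
  %priv.tmp = insertvalue { i64, i1 } poison, i64 %loaded, 0
  %priv.res = insertvalue { i64, i1 } %priv.tmp, i1 %eq, 1
  br label %atomicrmw.phi

atomicrmw.global:
  ; The hardware cmpxchg is kept, annotated so later passes know it cannot
  ; touch the private address space.
  %glob.res = cmpxchg ptr %ptr, i64 %val, i64 %swap syncscope("agent") monotonic seq_cst, align 8, !noalias.addrspace !0
  br label %atomicrmw.phi

atomicrmw.phi:
  %result = phi { i64, i1 } [ %priv.res, %atomicrmw.private ], [ %glob.res, %atomicrmw.global ]
  ret { i64, i1 } %result
}

!0 = !{i32 5, i32 6}  ; marks addrspace(5) as excluded from the possible targets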
>From 2793c97b42bd3183a2422b35c0603bba9ea83d63 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Mon, 30 Sep 2024 11:08:44 +0400 Subject: [PATCH 2/2] Rename helper function --- llvm/include/llvm/Transforms/Utils/LowerAtomic.h | 7 +++---- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 ++-- llvm/lib/Transforms/Utils/LowerAtomic.cpp | 9 +++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h index 295c2bd2b4b47e..d34f8c7fd88822 100644 --- a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h +++ b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h @@ -25,10 +25,9 @@ bool lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI); /// Emit IR to implement the given cmpxchg operation on values in registers, /// returning the new value. -std::pair<Value *, Value *> buildAtomicCmpXchgValue(IRBuilderBase &Builder, - Value *Ptr, Value *Cmp, - Value *Val, - Align Alignment); +std::pair<Value *, Value *> buildCmpXchgValue(IRBuilderBase &Builder, + Value *Ptr, Value *Cmp, + Value *Val, Align Alignment); /// Convert the given RMWI into primitive load and stores, /// assuming that doing so is legal. Return true if the lowering diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c8a46875bda408..204ffa2920eefe 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16895,8 +16895,8 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate( Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign()); } else { auto [ResultLoad, Equal] = - buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(), - CX->getNewValOperand(), CX->getAlign()); + buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(), + CX->getNewValOperand(), CX->getAlign()); Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()), ResultLoad, 0); diff --git a/llvm/lib/Transforms/Utils/LowerAtomic.cpp b/llvm/lib/Transforms/Utils/LowerAtomic.cpp index 89c49d4a0732f6..b51c32485411da 100644 --- a/llvm/lib/Transforms/Utils/LowerAtomic.cpp +++ b/llvm/lib/Transforms/Utils/LowerAtomic.cpp @@ -26,7 +26,7 @@ bool llvm::lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { Value *Val = CXI->getNewValOperand(); auto [Orig, Equal] = - buildAtomicCmpXchgValue(Builder, Ptr, Cmp, Val, CXI->getAlign()); + buildCmpXchgValue(Builder, Ptr, Cmp, Val, CXI->getAlign()); Value *Res = Builder.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0); @@ -37,9 +37,10 @@ bool llvm::lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { return true; } -std::pair<Value *, Value *> -llvm::buildAtomicCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, - Value *Val, Align Alignment) { +std::pair<Value *, Value *> llvm::buildCmpXchgValue(IRBuilderBase &Builder, + Value *Ptr, Value *Cmp, + Value *Val, + Align Alignment) { LoadInst *Orig = Builder.CreateAlignedLoad(Val->getType(), Ptr, Alignment); Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); Value *Res = Builder.CreateSelect(Equal, Val, Orig); _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits