diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bddb6e822b81b..8d7eedaa0d4d9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -39,6 +39,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/ModRef.h"
@@ -16308,12 +16309,45 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
              : TargetLowering::AtomicExpansionKind::CmpXChg;
 }
 
+/// Return if a flat address space atomicrmw can access private memory.
+static bool flatInstrMayAccessPrivate(const Instruction *I) {
+  const MDNode *NoaliasAddrSpaceMD =
+      I->getMetadata(LLVMContext::MD_noalias_addrspace);
+  if (!NoaliasAddrSpaceMD)
+    return true;
+
+  for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
+       ++I) {
+    auto *Low = mdconst::extract<ConstantInt>(
+        NoaliasAddrSpaceMD->getOperand(2 * I + 0));
+    auto *High = mdconst::extract<ConstantInt>(
+        NoaliasAddrSpaceMD->getOperand(2 * I + 1));
+
+    // If an excluded range [Low, High) covers the private address space, this
+    // instruction cannot access private memory.
+    if (Low->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS) &&
+        High->getValue().ugt(AMDGPUAS::PRIVATE_ADDRESS))
+      return false;
+  }
+
+  return true;
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
     return AtomicExpansionKind::NotAtomic;
 
+  // 64-bit flat atomics that dynamically reside in private memory will silently
+  // be dropped.
+  //
+  // Note that we will emit a new copy of the original atomic in the expansion,
+  // which will be incrementally relegalized.
+  const DataLayout &DL = RMW->getFunction()->getDataLayout();
+  if (AS == AMDGPUAS::FLAT_ADDRESS &&
+      DL.getTypeSizeInBits(RMW->getType()) == 64 &&
+      flatInstrMayAccessPrivate(RMW))
+    return AtomicExpansionKind::Expand;
+
   auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
     OptimizationRemarkEmitter ORE(RMW->getFunction());
     ORE.emit([=]() {
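For reference, the !noalias.addrspace operands are [lo, hi) pairs of address spaces the pointer is known not to access. A minimal IR sketch, with illustrative names: a flat atomicrmw that is known not to target the private address space (5) looks like

    %rmw = atomicrmw add ptr %flat.ptr, i64 1 seq_cst, !noalias.addrspace !0

    !0 = !{i32 5, i32 6}  ; excluded range [5, 6) covers private

so flatInstrMayAccessPrivate returns false for it, and the 64-bit flat case added above does not request expansion.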
@@ -16714,20 +16748,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
       Op == AtomicRMWInst::Xor) {
-    // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-    assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
-           "this cannot be replaced with add");
-    AI->setOperation(AtomicRMWInst::Add);
-    return;
+    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // TODO: Turn the below private handling into a no-op for idempotent
+      // cases.
+    }
   }
 
-  assert(Subtarget->hasAtomicFaddInsts() &&
-         "target should have atomic fadd instructions");
-  assert(AI->getType()->isFloatTy() &&
-         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
-         "generic atomicrmw expansion only supports FP32 operand in flat "
-         "address space");
-  assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  // TODO: Avoid the private check for the fadd case depending on
+  // noalias.addrspace.
+
+  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           AI->getType()->isFloatTy();
 
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
@@ -16767,6 +16815,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   //
   // atomicrmw.end:
   //   [...]
+  //
+  //
+  // For 64-bit atomics which may reside in private memory, we perform a simpler
+  // version that only inserts the private check, and uses the flat operation.
 
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
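For the !FullFlatEmulation path (a 64-bit flat atomicrmw that may access private memory), the emitted structure reduces to the following sketch; this is illustrative IR under the naming used in the comments above, not checked compiler output:

      %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
      br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

    atomicrmw.private:
      %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
      ; non-atomic load, op, store through %cast.private
      br label %atomicrmw.phi

    atomicrmw.global:
      ; the original flat atomicrmw is re-emitted here, annotated with
      ; !noalias.addrspace !N where !N = !{i32 5, i32 6}, so a later round of
      ; legalization does not expand it again
      br label %atomicrmw.phi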
@@ -16778,9 +16830,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Function *F = BB->getParent();
   BasicBlock *ExitBB =
       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
-  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
-  BasicBlock *CheckPrivateBB =
-      BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  BasicBlock *SharedBB = nullptr;
+
+  BasicBlock *CheckPrivateBB = BB;
+  if (FullFlatEmulation) {
+    SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+    CheckPrivateBB =
+        BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  }
+
   BasicBlock *PrivateBB =
       BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16793,23 +16851,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
-  CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
-                                               {Addr}, nullptr, "is.shared");
-  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
-  Builder.SetInsertPoint(SharedBB);
-  Value *CastToLocal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
+  Value *LoadedShared = nullptr;
+  if (FullFlatEmulation) {
+    CallInst *IsShared = Builder.CreateIntrinsic(
+        Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
+    Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+    Builder.SetInsertPoint(SharedBB);
+    Value *CastToLocal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
 
-  Instruction *Clone = AI->clone();
-  Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
-  Instruction *LoadedShared = Clone;
+    Instruction *Clone = AI->clone();
+    Clone->insertInto(SharedBB, SharedBB->end());
+    Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToLocal);
+    LoadedShared = Clone;
 
-  Builder.CreateBr(PhiBB);
+    Builder.CreateBr(PhiBB);
+    Builder.SetInsertPoint(CheckPrivateBB);
+  }
 
-  Builder.SetInsertPoint(CheckPrivateBB);
   CallInst *IsPrivate = Builder.CreateIntrinsic(
       Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16826,15 +16887,32 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
-  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-  Value *LoadedGlobal = AI;
-
-  AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
+  // Continue using a flat instruction if we only emitted the check for
+  // private.
+  Instruction *LoadedGlobal = AI;
+  if (FullFlatEmulation) {
+    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
+    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToGlobal);
+  }
 
   AI->removeFromParent();
   AI->insertInto(GlobalBB, GlobalBB->end());
 
+  // The new atomicrmw may go through another round of legalization later.
+  if (!FullFlatEmulation) {
+    // We inserted the runtime check already, make sure we do not try to
+    // re-expand this.
+    // TODO: Should union with any existing metadata.
+    MDBuilder MDB(F->getContext());
+    MDNode *RangeNotPrivate =
+        MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
+                        APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
+    LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
+                              RangeNotPrivate);
+  }
+
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(PhiBB);
@@ -16842,7 +16920,8 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   if (ReturnValueIsUsed) {
     PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
     AI->replaceAllUsesWith(Loaded);
-    Loaded->addIncoming(LoadedShared, SharedBB);
+    if (FullFlatEmulation)
+      Loaded->addIncoming(LoadedShared, SharedBB);
     Loaded->addIncoming(LoadedPrivate, PrivateBB);
     Loaded->addIncoming(LoadedGlobal, GlobalBB);
     Loaded->takeName(AI);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 43266554c2d8a..d38a9051175be 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -1332,7 +1332,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret double %result
 }
 
@@ -1482,7 +1482,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -2215,3 +2215,4 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 }
 
 !0 = !{}
+!1 = !{i32 5, i32 6}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index 9be4fec5a3b95..3678eb5ac7682 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -1332,7 +1332,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1,
!amdgpu.no.fine.grained.memory !0 ret double %result } @@ -1482,7 +1482,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2215,3 +2215,4 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index e28a1efb75404..ea44612465579 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -1645,7 +1645,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void } @@ -1747,7 +1747,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void } @@ -1820,7 +1820,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -1899,7 +1899,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -1978,7 +1978,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -2106,7 +2106,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % %gep.tid = getelementptr i64, ptr %ptr, i32 %id %out.gep = getelementptr i64, ptr %out, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out.gep, align 4 ret void } @@ -2205,7 +2205,7 @@ define amdgpu_kernel void 
@flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3312,7 +3312,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0 - %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i32 %idx.0, ptr addrspace(1) %add_use, align 4 store i64 %result, ptr addrspace(1) %out, align 4 ret void @@ -3321,5 +3321,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, attributes #0 = { nounwind speculatable willreturn memory(none) } attributes #1 = { nounwind } attributes #2 = { nounwind memory(none) } + +!0 = !{i32 5, i32 6} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index d63044d7cec6d..4023e053c66c5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -2754,7 +2754,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm - %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void } @@ -2856,7 +2856,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void } @@ -2958,7 +2958,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void } @@ -3031,7 +3031,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3110,7 +3110,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: 
s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3189,7 +3189,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3317,7 +3317,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % %gep.tid = getelementptr i64, ptr %ptr, i32 %id %out.gep = getelementptr i64, ptr %out, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out.gep, align 4 ret void } @@ -3416,7 +3416,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3524,5 +3524,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, attributes #0 = { nounwind speculatable willreturn memory(none) } attributes #1 = { nounwind } attributes #2 = { nounwind memory(none) } + +!0 = !{i32 5, i32 6} + ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll index c1cb74cb0e25a..c9ab351f94016 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll @@ -15,7 +15,7 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -38,8 +38,9 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 92ce2af47e22a..605c8f7e36919 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1371,7 +1371,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1400,7 +1400,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1431,7 +1431,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1458,7 +1458,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace 
!1, !amdgpu.no.fine.grained.memory !0 ret double %ret } @@ -1483,7 +1483,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } @@ -1512,7 +1512,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } @@ -1541,7 +1541,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1713,3 +1713,4 @@ attributes #1 = { nounwind } attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll index 32cb1056022de..d64becc74ddc2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=finalize-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_intrinsic(ptr %ptr, double %data) { ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_intrinsic @@ -73,7 +71,7 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -123,7 +121,7 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 - %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 + 
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 61cac642d19e8..ff48a3fc98018 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -5669,9 +5669,17 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB30_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB30_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -5683,11 +5691,27 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB30_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: .LBB30_4: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB30_6 +; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[0:1], v[4:5], v[2:3] +; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB30_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -5696,18 +5720,54 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB30_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB30_4 +; GFX940-NEXT: .LBB30_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: 
s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB30_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB30_2 +; GFX940-NEXT: .LBB30_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB30_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB30_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -5719,11 +5779,25 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB30_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB30_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: .LBB30_4: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB30_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB30_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5731,9 +5805,16 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, 
vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execz .LBB30_4 +; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB30_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 @@ -5745,10 +5826,28 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB30_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB30_2 +; GFX10-NEXT: ; %bb.3: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: .LBB30_4: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB30_6 +; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: .LBB30_6: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 @@ -5757,17 +5856,54 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB30_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB30_4 +; GFX90A-NEXT: .LBB30_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB30_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB30_2 +; GFX90A-NEXT: .LBB30_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB30_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB30_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -5777,24 +5913,50 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB30_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB30_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: .LBB30_4: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB30_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB30_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB30_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB30_1: ; 
%atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB30_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -5804,24 +5966,51 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB30_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB30_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: .LBB30_4: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB30_6 +; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB30_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB30_4 +; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v5, v[4:5] ; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: flat_load_dword v5, v[5:6] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB30_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -5831,13 +6020,31 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB30_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB30_2 +; GFX7-NEXT: ; %bb.3: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: .LBB30_4: ; %Flow3 +; 
GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB30_6 +; GFX7-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[0:1], v[4:5], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: .LBB30_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result @@ -5851,63 +6058,151 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB31_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB31_6 +; GFX12-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB31_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB31_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB31_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB31_2 +; GFX12-NEXT: .LBB31_6: ; %atomicrmw.private +; GFX12-NEXT: 
v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB31_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB31_4 +; GFX940-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB31_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:2040 sc0 +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB31_2 +; GFX940-NEXT: .LBB31_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB31_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB31_6 +; GFX11-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB31_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB31_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB31_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB31_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB31_2 +; GFX11-NEXT: .LBB31_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5915,9 +6210,22 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB31_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB31_6 +; GFX10-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB31_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB31_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v9, v1 @@ -5929,56 +6237,145 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB31_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB31_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: 
s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB31_2 +; GFX10-NEXT: .LBB31_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:2040 glc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB31_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB31_4 +; GFX90A-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB31_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB31_2 +; GFX90A-NEXT: .LBB31_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB31_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB31_6 +; GFX908-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX908-NEXT: 
s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB31_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB31_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB31_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB31_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB31_2 +; GFX908-NEXT: .LBB31_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB31_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB31_6 +; GFX8-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB31_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB31_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: 
v_mov_b32_e32 v9, v1 @@ -5988,24 +6385,55 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB31_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB31_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB31_2 +; GFX8-NEXT: .LBB31_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB31_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB31_6 +; GFX7-NEXT: .LBB31_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB31_3: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB31_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -6015,11 +6443,27 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB31_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB31_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: 
s_cbranch_execz .LBB31_2 +; GFX7-NEXT: .LBB31_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6034,70 +6478,151 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB32_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB32_6 +; GFX12-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB32_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB32_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] +; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB32_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB32_2 +; GFX12-NEXT: .LBB32_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; 
GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB32_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB32_4 +; GFX940-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB32_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB32_2 +; GFX940-NEXT: .LBB32_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo -; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, 0xfffff800, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v4, vcc_lo -; GFX11-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB32_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: 
s_cbranch_execnz .LBB32_6 +; GFX11-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB32_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB32_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[2:3] +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[5:6], v[7:10] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB32_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB32_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-NEXT: .LBB32_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6106,9 +6631,22 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB32_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB32_6 +; GFX10-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB32_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB32_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v9, v1 @@ -6120,21 +6658,65 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: 
s_cbranch_execnz .LBB32_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB32_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB32_2 +; GFX10-NEXT: .LBB32_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB32_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB32_4 +; GFX90A-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB32_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB32_2 +; GFX90A-NEXT: .LBB32_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6142,38 +6724,79 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; 
GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v9, v1 -; GFX908-NEXT: v_mov_b32_e32 v8, v0 -; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB32_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB32_6 +; GFX908-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB32_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB32_4: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB32_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB32_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB32_2 +; GFX908-NEXT: .LBB32_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB32_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB32_6 +; GFX8-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB32_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB32_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -6183,24 +6806,55 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB32_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB32_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB32_2 +; GFX8-NEXT: .LBB32_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB32_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB32_6 +; GFX7-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB32_3: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB32_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -6210,11 +6864,27 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB32_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB32_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB32_2 +; GFX7-NEXT: .LBB32_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6229,9 +6899,25 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB33_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB33_6 +; GFX12-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB33_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] @@ -6242,11 +6928,24 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB33_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB33_2 +; GFX12-NEXT: .LBB33_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -6254,18 +6953,56 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB33_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB33_4 +; GFX940-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB33_2 +; GFX940-NEXT: .LBB33_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB33_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB33_6 +; GFX11-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB33_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6276,20 +7013,44 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB33_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB33_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; 
GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-NEXT: .LBB33_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB33_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB33_6 +; GFX10-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB33_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6301,27 +7062,82 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB33_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB33_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB33_2 +; GFX10-NEXT: .LBB33_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB33_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB33_4 +; GFX90A-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB33_2 +; GFX90A-NEXT: .LBB33_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB33_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB33_6 +; GFX908-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB33_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6330,23 +7146,52 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB33_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB33_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB33_2 +; GFX908-NEXT: .LBB33_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB33_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB33_6 +; GFX8-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v6, v[0:1] ; GFX8-NEXT: flat_load_dword v7, v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB33_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6355,23 +7200,53 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB33_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB33_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB33_2 +; GFX8-NEXT: .LBB33_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB33_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB33_6 +; GFX7-NEXT: .LBB33_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB33_3: ; %atomicrmw.global ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v6, v[0:1] ; GFX7-NEXT: flat_load_dword v7, v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 
+; GFX7-NEXT: .LBB33_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6380,12 +7255,28 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB33_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB33_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB33_2 +; GFX7-NEXT: .LBB33_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fadd ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -6399,24 +7290,56 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] offset:2040 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB34_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB34_6 +; GFX12-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB34_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: 
s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB34_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB34_2 +; GFX12-NEXT: .LBB34_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -6424,33 +7347,88 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB34_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB34_4 +; GFX940-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:2040 +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB34_2 +; GFX940-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] offset:2040 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB34_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: 
s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB34_6 +; GFX11-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB34_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB34_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB34_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-NEXT: .LBB34_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6459,9 +7437,21 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB34_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB34_6 +; GFX10-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB34_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6473,95 +7463,229 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB34_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB34_4 +; 
GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB34_2 +; GFX10-NEXT: .LBB34_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:2040 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB34_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB34_4 +; GFX90A-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB34_2 +; GFX90A-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:2040 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB34_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB34_6 +; GFX908-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; 
GFX908-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB34_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB34_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB34_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB34_2 +; GFX908-NEXT: .LBB34_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7f8, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[0:1] -; GFX8-NEXT: flat_load_dword v6, v[8:9] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB34_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB34_6 +; GFX8-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB34_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; 
GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB34_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB34_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB34_2 +; GFX8-NEXT: .LBB34_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x7f8, v0 -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v7, v[0:1] -; GFX7-NEXT: flat_load_dword v6, v[8:9] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB34_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB34_6 +; GFX7-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB34_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB34_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB34_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB34_2 +; GFX7-NEXT: .LBB34_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -6576,24 +7700,56 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB35_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB35_6 +; GFX12-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB35_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB35_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB35_2 +; GFX12-NEXT: .LBB35_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; 
GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -6601,25 +7757,62 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB35_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB35_4 +; GFX940-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB35_2 +; GFX940-NEXT: .LBB35_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: flat_load_b64 v[6:7], v[4:5] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB35_6 +; GFX11-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB35_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6630,11 +7823,23 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB35_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB35_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB35_2 +; GFX11-NEXT: .LBB35_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -6643,9 +7848,21 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB35_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB35_6 +; GFX10-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB35_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] @@ -6657,10 +7874,26 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB35_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB35_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB35_2 +; GFX10-NEXT: .LBB35_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 
offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -6669,89 +7902,201 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB35_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB35_4 +; GFX90A-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB35_2 +; GFX90A-NEXT: .LBB35_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v8, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v1, vcc ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB35_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB35_6 +; GFX908-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB35_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB35_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB35_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB35_2 +; GFX908-NEXT: .LBB35_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[0:1] -; GFX8-NEXT: flat_load_dword v6, v[8:9] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB35_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB35_6 +; GFX8-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB35_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB35_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, 
s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB35_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB35_2 +; GFX8-NEXT: .LBB35_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v7, v[0:1] -; GFX7-NEXT: flat_load_dword v6, v[8:9] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB35_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB35_6 +; GFX7-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB35_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB35_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB35_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB35_2 +; GFX7-NEXT: .LBB35_6: ; %atomicrmw.private +; 
GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index ad5498723940d..36aa73fbf8e92 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -2770,148 +2770,382 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB18_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: .LBB18_4: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB18_6 +; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB18_2 +; GFX940-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
-; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB18_4: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB18_2 +; GFX10-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], 
v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB18_2 +; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB18_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: 
flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB18_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: .LBB18_4: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB18_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB18_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; 
GFX8-NEXT: s_cbranch_execnz .LBB18_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB18_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: .LBB18_4: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB18_6 +; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB18_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB18_4 +; GFX7-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB18_2 +; GFX7-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result @@ -2925,127 +3159,329 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 
; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_6 +; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB19_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB19_2 +; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7f8
+; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB19_3
+; GFX940-NEXT:  ; %bb.1: ; %Flow
+; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB19_4
+; GFX940-NEXT:  .LBB19_2: ; %atomicrmw.phi
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-NEXT:  .LBB19_3: ; %atomicrmw.global
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:2040 sc0
+; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT:    s_cbranch_execz .LBB19_2
+; GFX940-NEXT:  .LBB19_4: ; %atomicrmw.private
+; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX940-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX940-NEXT:    scratch_store_dwordx2 v6, v[2:3], off sc0 sc1
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b64 v[4:5], v[0:1] offset:2040
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7f8, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v5
+; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB19_3
+; GFX11-NEXT:  ; %bb.1: ; %Flow2
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB19_6
+; GFX11-NEXT:  .LBB19_2: ; %atomicrmw.phi
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB19_3: ; %atomicrmw.global
+; GFX11-NEXT:    flat_load_b64 v[0:1], v[4:5]
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:  .LBB19_4: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX11-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB19_4
+; GFX11-NEXT:  ; %bb.5: ; %Flow
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execz .LBB19_2
+; GFX11-NEXT:  .LBB19_6: ; %atomicrmw.private
+; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
+; GFX11-NEXT:    scratch_load_b64 v[0:1], v6, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX11-NEXT:    scratch_store_b64 v6, v[2:3], off
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7f8, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7f8, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v5
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB19_3
+; GFX10-NEXT:  ; %bb.1: ; %Flow
+; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB19_4
+; GFX10-NEXT:  .LBB19_2: ; %atomicrmw.phi
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-NEXT:  .LBB19_3: ; %atomicrmw.global
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_cbranch_execz .LBB19_2
+; GFX10-NEXT:  .LBB19_4: ; %atomicrmw.private
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX10-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX10-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:2040 glc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB19_3
+; GFX90A-NEXT:  ; %bb.1: ; %Flow
+; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB19_4
+; GFX90A-NEXT:  .LBB19_2: ; %atomicrmw.phi
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+; GFX90A-NEXT:  .LBB19_3: ; %atomicrmw.global
+; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT:    s_cbranch_execz .LBB19_2
+; GFX90A-NEXT:  .LBB19_4: ; %atomicrmw.private
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX90A-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX90A-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    flat_load_dwordx2 v[4:5], v[0:1] offset:2040
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7f8, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB19_3
+; GFX908-NEXT:  ; %bb.1: ; %Flow2
+; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB19_6
+; GFX908-NEXT:  .LBB19_2: ; %atomicrmw.phi
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-NEXT:  .LBB19_3: ; %atomicrmw.global
+; GFX908-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:  .LBB19_4: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v7, v5
-; GFX908-NEXT:    v_mov_b32_e32 v6, v4
-; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX908-NEXT:    v_mov_b32_e32 v9, v1
+; GFX908-NEXT:    v_mov_b32_e32 v8, v0
+; GFX908-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX908-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
-; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB19_4
+; GFX908-NEXT:  ; %bb.5: ; %Flow
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_cbranch_execz .LBB19_2
+; GFX908-NEXT:  .LBB19_6: ; %atomicrmw.private
+; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX908-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX908-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX908-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX908-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v4
-; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
+; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7f8, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
+; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB19_3
+; GFX8-NEXT:  ; %bb.1: ; %Flow2
+; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB19_6
+; GFX8-NEXT:  .LBB19_2: ; %atomicrmw.phi
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-NEXT:  .LBB19_3: ; %atomicrmw.global
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; GFX8-NEXT:    flat_load_dword v1, v[0:1]
 ; GFX8-NEXT:    flat_load_dword v0, v[4:5]
-; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:  .LBB19_4: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v9, v1
@@ -3056,21 +3492,71 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
-; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB19_4
+; GFX8-NEXT:  ; %bb.5: ; %Flow
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_cbranch_execz .LBB19_2
+; GFX8-NEXT:  .LBB19_6: ; %atomicrmw.private
+; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 4, v6
+; GFX8-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_load_dword v1, v7, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX8-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7f8, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7f8, v0
+; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
+; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB19_3
+; GFX7-NEXT:  ; %bb.1: ; %Flow
+; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB19_4
+; GFX7-NEXT:  .LBB19_2: ; %atomicrmw.phi
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-NEXT:  .LBB19_3: ; %atomicrmw.global
+; GFX7-NEXT:    flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT:    s_cbranch_execz .LBB19_2
+; GFX7-NEXT:  .LBB19_4: ; %atomicrmw.private
+; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 4, v6
+; GFX7-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_load_dword v1, v7, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX7-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr %ptr, i64 255
   %result = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -3085,141 +3571,330 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b64 v[4:5], v[0:1] offset:-2048
 ; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX12-NEXT:    s_mov_b32 s0, exec_lo
+; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v5
+; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB20_3
+; GFX12-NEXT:  ; %bb.1: ; %Flow2
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB20_6
+; GFX12-NEXT:  .LBB20_2: ; %atomicrmw.phi
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-NEXT:  .LBB20_3: ; %atomicrmw.global
+; GFX12-NEXT:    flat_load_b64 v[0:1], v[4:5]
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB20_4: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[8:9], v[8:9]
+; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[0:1], v[2:3]
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB20_4
+; GFX12-NEXT:  ; %bb.5: ; %Flow
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX12-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX12-NEXT:    s_cbranch_execz .LBB20_2
+; GFX12-NEXT:  .LBB20_6: ; %atomicrmw.private
+; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX12-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
+; GFX12-NEXT:    scratch_load_b64 v[0:1], v6, off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[2:3]
+; GFX12-NEXT:    scratch_store_b64 v6, v[2:3], off
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX940-NEXT:    s_nop 1
-; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX940-NEXT:    s_movk_i32 s0, 0xf800
+; GFX940-NEXT:    s_mov_b32 s1, -1
+; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB20_3
+; GFX940-NEXT:  ; %bb.1: ; %Flow
+; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB20_4
+; GFX940-NEXT:  .LBB20_2: ; %atomicrmw.phi
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-NEXT:  .LBB20_3: ; %atomicrmw.global
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
+; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT:    s_cbranch_execz .LBB20_2
+; GFX940-NEXT:  .LBB20_4: ; %atomicrmw.private
+; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v6, off
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX940-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX940-NEXT:    scratch_store_dwordx2 v6, v[2:3], off sc0 sc1
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v5
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v5, vcc_lo, 0xfffff800, v5
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, -1, v4, vcc_lo
-; GFX11-NEXT:    flat_load_b64 v[0:1], v[0:1]
-; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v5
+; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB20_3
+; GFX11-NEXT:  ; %bb.1: ; %Flow2
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB20_6
+; GFX11-NEXT:  .LBB20_2: ; %atomicrmw.phi
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB20_3: ; %atomicrmw.global
+; GFX11-NEXT:    flat_load_b64 v[0:1], v[4:5]
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:  .LBB20_4: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX11-NEXT:    v_max_f64 v[7:8], v[0:1], v[2:3]
+; GFX11-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX11-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[5:6], v[7:10] glc
+; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB20_4
+; GFX11-NEXT:  ; %bb.5: ; %Flow
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execz .LBB20_2
+; GFX11-NEXT:  .LBB20_6: ; %atomicrmw.private
+; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
+; GFX11-NEXT:    scratch_load_b64 v[0:1], v6, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX11-NEXT:    scratch_store_b64 v6, v[2:3], off
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v5
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB20_3
+; GFX10-NEXT:  ; %bb.1: ; %Flow
+; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB20_4
+; GFX10-NEXT:  .LBB20_2: ; %atomicrmw.phi
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-NEXT:  .LBB20_3: ; %atomicrmw.global
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_cbranch_execz .LBB20_2
+; GFX10-NEXT:  .LBB20_4: ; %atomicrmw.private
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc_lo
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX10-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX10-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB20_3
+; GFX90A-NEXT:  ; %bb.1: ; %Flow
+; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB20_4
+; GFX90A-NEXT:  .LBB20_2: ; %atomicrmw.phi
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+; GFX90A-NEXT:  .LBB20_3: ; %atomicrmw.global
+; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT:    s_cbranch_execz .LBB20_2
+; GFX90A-NEXT:  .LBB20_4: ; %atomicrmw.private
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX90A-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX90A-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_add_co_u32_e32 v5, vcc, 0xfffff800, v0
-; GFX908-NEXT:    v_mov_b32_e32 v4, v1
-; GFX908-NEXT:    s_mov_b64 s[4:5], vcc
-; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v4, vcc
-; GFX908-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX908-NEXT:    v_addc_co_u32_e64 v6, vcc, -1, v4, s[4:5]
-; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB20_3
+; GFX908-NEXT:  ; %bb.1: ; %Flow2
+; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB20_6
+; GFX908-NEXT:  .LBB20_2: ; %atomicrmw.phi
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-NEXT:  .LBB20_3: ; %atomicrmw.global
+; GFX908-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:  .LBB20_4: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v10, v1
-; GFX908-NEXT:    v_mov_b32_e32 v9, v0
-; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX908-NEXT:    v_max_f64 v[7:8], v[0:1], v[2:3]
-; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc
+; GFX908-NEXT:    v_mov_b32_e32 v9, v1
+; GFX908-NEXT:    v_mov_b32_e32 v8, v0
+; GFX908-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX908-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
-; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
-; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB20_4
+; GFX908-NEXT:  ; %bb.5: ; %Flow
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_cbranch_execz .LBB20_2
+; GFX908-NEXT:  .LBB20_6: ; %atomicrmw.private
+; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX908-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX908-NEXT:    buffer_load_dword v1, v6, s[0:3], 0 offen offset:4
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX908-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX908-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen offset:4
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
+; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff804, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
+; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB20_3
+; GFX8-NEXT:  ; %bb.1: ; %Flow2
+; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB20_6
+; GFX8-NEXT:  .LBB20_2: ; %atomicrmw.phi
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-NEXT:  .LBB20_3: ; %atomicrmw.global
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; GFX8-NEXT:    flat_load_dword v1, v[0:1]
 ; GFX8-NEXT:    flat_load_dword v0, v[4:5]
-; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:  .LBB20_4: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v9, v1
@@ -3230,21 +3905,71 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
-; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB20_4
+; GFX8-NEXT:  ; %bb.5: ; %Flow
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX8-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_cbranch_execz .LBB20_2
+; GFX8-NEXT:  .LBB20_6: ; %atomicrmw.private
+; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 4, v6
+; GFX8-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_load_dword v1, v7, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX8-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT:    flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v5
+; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB20_3
+; GFX7-NEXT:  ; %bb.1: ; %Flow
+; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB20_4
+; GFX7-NEXT:  .LBB20_2: ; %atomicrmw.phi
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-NEXT:  .LBB20_3: ; %atomicrmw.global
+; GFX7-NEXT:    flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT:    s_cbranch_execz .LBB20_2
+; GFX7-NEXT:  .LBB20_4: ; %atomicrmw.private
+; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX7-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 4, v6
+; GFX7-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_load_dword v1, v7, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX7-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr %ptr, i64 -256
   %result = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -3259,10 +3984,26 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b64 v[4:5], v[0:1]
 ; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX12-NEXT:    s_mov_b32 s0, exec_lo
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v1
+; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_3
+; GFX12-NEXT:  ; %bb.1: ; %Flow2
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_6
+; GFX12-NEXT:  .LBB21_2: ; %atomicrmw.phi
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-NEXT:  .LBB21_3: ; %atomicrmw.global
+; GFX12-NEXT:    flat_load_b64 v[4:5], v[0:1]
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB21_4: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
@@ -3275,11 +4016,26 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
 ; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX12-NEXT:  ; %bb.5: ; %Flow
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX12-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX12-NEXT:    s_cbranch_execz .LBB21_2
+; GFX12-NEXT:  .LBB21_6: ; %atomicrmw.private
+; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX12-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
+; GFX12-NEXT:    scratch_load_b64 v[0:1], v2, off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[6:7]
+; GFX12-NEXT:    scratch_store_b64 v2, v[0:1], off
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
@@ -3287,19 +4043,59 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB21_3
+; GFX940-NEXT:  ; %bb.1: ; %Flow
+; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX940-NEXT:  .LBB21_2: ; %atomicrmw.phi
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-NEXT:  .LBB21_3: ; %atomicrmw.global
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT:    s_cbranch_execz .LBB21_2
+; GFX940-NEXT:  .LBB21_4: ; %atomicrmw.private
+; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX940-NEXT:    scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b64 v[4:5], v[0:1]
 ; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v1
+; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_3
+; GFX11-NEXT:  ; %bb.1: ; %Flow2
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_6
+; GFX11-NEXT:  .LBB21_2: ; %atomicrmw.phi
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB21_3: ; %atomicrmw.global
+; GFX11-NEXT:    flat_load_b64 v[4:5], v[0:1]
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:  .LBB21_4: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
@@ -3312,40 +4108,126 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
 ; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX11-NEXT:  ; %bb.5: ; %Flow
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execz .LBB21_2
+; GFX11-NEXT:  .LBB21_6: ; %atomicrmw.private
+; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
+; GFX11-NEXT:    scratch_load_b64 v[0:1], v2, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX11-NEXT:    scratch_store_b64 v2, v[0:1], off
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v1
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB21_3
+; GFX10-NEXT:  ; %bb.1: ; %Flow
+; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX10-NEXT:  .LBB21_2: ; %atomicrmw.phi
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-NEXT:  .LBB21_3: ; %atomicrmw.global
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[2:3]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_cbranch_execz .LBB21_2
+; GFX10-NEXT:  .LBB21_4: ; %atomicrmw.private
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX10-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB21_3
+; GFX90A-NEXT:  ; %bb.1: ; %Flow
+; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX90A-NEXT:  .LBB21_2: ; %atomicrmw.phi
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+; GFX90A-NEXT:  .LBB21_3: ; %atomicrmw.global
 ; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT:    s_cbranch_execz .LBB21_2
+; GFX90A-NEXT:  .LBB21_4: ; %atomicrmw.private
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX90A-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_3
+; GFX908-NEXT:  ; %bb.1: ; %Flow2
+; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_6
+; GFX908-NEXT:  .LBB21_2: ; %atomicrmw.phi
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-NEXT:  .LBB21_3: ; %atomicrmw.global
+; GFX908-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:  .LBB21_4: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
@@ -3355,24 +4237,54 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v3
-; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v2
-; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
-; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX908-NEXT:  ; %bb.5: ; %Flow
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_cbranch_execz .LBB21_2
+; GFX908-NEXT:  .LBB21_6: ; %atomicrmw.private
+; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX908-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX908-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v0
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dword v4, v[0:1]
-; GFX8-NEXT:    flat_load_dword v5, v[5:6]
 ; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
+; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_3
+; GFX8-NEXT:  ; %bb.1: ; %Flow2
+; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_6
+; GFX8-NEXT:  .LBB21_2: ; %atomicrmw.phi
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-NEXT:  .LBB21_3: ; %atomicrmw.global
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[2:3]
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:  .LBB21_4: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
@@ -3382,20 +4294,69 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v3
-; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v2
-; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
-; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX8-NEXT:  ; %bb.5: ; %Flow
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX8-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_cbranch_execz .LBB21_2
+; GFX8-NEXT:  .LBB21_6: ; %atomicrmw.private
+; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v2
+; GFX8-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_load_dword v1, v3, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
+; GFX8-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0xc0
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB21_3
+; GFX7-NEXT:  ; %bb.1: ; %Flow
+; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX7-NEXT:  .LBB21_2: ; %atomicrmw.phi
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-NEXT:  .LBB21_3: ; %atomicrmw.global
 ; GFX7-NEXT:    flat_atomic_fmax_x2 v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX7-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX7-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT:    s_cbranch_execz .LBB21_2
+; GFX7-NEXT:  .LBB21_4: ; %atomicrmw.private
+; GFX7-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX7-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v4
+; GFX7-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_load_dword v1, v5, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX7-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX7-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -3409,27 +4370,61 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b64 v[4:5], v[0:1] offset:2040
-; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT:    v_add_co_u32 v6, vcc_lo, 0x7f8, v0
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX12-NEXT:    s_mov_b32 s0, exec_lo
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_cmpx_ne_u32_e64 s1, v7
+; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB22_3
+; GFX12-NEXT:  ; %bb.1: ; %Flow2
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB22_6
+; GFX12-NEXT:  .LBB22_2: ; %atomicrmw.phi
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-NEXT:  .LBB22_3: ; %atomicrmw.global
+; GFX12-NEXT:    flat_load_b64 v[2:3], v[6:7]
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB22_4: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB22_4
+; GFX12-NEXT:  ; %bb.5: ; %Flow
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX12-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX12-NEXT:    s_cbranch_execz .LBB22_2
+; GFX12-NEXT:  .LBB22_6: ; %atomicrmw.private
+; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX12-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc_lo
+; GFX12-NEXT:    scratch_load_b64 v[0:1], v2, off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    scratch_store_b64 v2, v[0:1], off
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB22_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
@@ -3437,36 +4432,95 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7f8
+; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX940-NEXT:    v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX940-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB22_3
+; GFX940-NEXT:  ; %bb.1: ; %Flow
+; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB22_4
+; GFX940-NEXT:  .LBB22_2: ; %atomicrmw.phi
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-NEXT:  .LBB22_3: ; %atomicrmw.global
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3] offset:2040
+; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX940-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT:    s_cbranch_execz .LBB22_2
+; GFX940-NEXT:  .LBB22_4: ; %atomicrmw.private
+; GFX940-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v4, off
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX940-NEXT:    scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b64 v[4:5], v[0:1] offset:2040
-; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, 0x7f8, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmpx_ne_u32_e64 s1, v7
+; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB22_3
+; GFX11-NEXT:  ; %bb.1: ; %Flow2
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB22_6
+; GFX11-NEXT:  .LBB22_2: ; %atomicrmw.phi
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-NEXT:  .LBB22_3: ; %atomicrmw.global
+; GFX11-NEXT:    flat_load_b64 v[2:3], v[6:7]
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:  .LBB22_4: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 glc
+; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB22_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB22_4
+; GFX11-NEXT:  ; %bb.5: ; %Flow
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execz .LBB22_2
+; GFX11-NEXT:  .LBB22_6: ; %atomicrmw.private
+; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc_lo
+; GFX11-NEXT:    scratch_load_b64 v[0:1], v2, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT:    scratch_store_b64 v2, v[0:1], off
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3475,83 +4529,238 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7f8, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v1
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB22_3
+; GFX10-NEXT:  ; %bb.1: ; %Flow
+; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB22_4
+; GFX10-NEXT:  .LBB22_2: ; %atomicrmw.phi
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-NEXT:  .LBB22_3: ; %atomicrmw.global
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[2:3]
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_cbranch_execz .LBB22_2
+; GFX10-NEXT:  .LBB22_4: ; %atomicrmw.private
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX10-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3] offset:2040
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7f8, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB22_3
+; GFX90A-NEXT:  ; %bb.1: ; %Flow
+; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB22_4
+; GFX90A-NEXT:  .LBB22_2: ; %atomicrmw.phi
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+; GFX90A-NEXT:  .LBB22_3: ; %atomicrmw.global
+; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT:    s_cbranch_execz .LBB22_2
+; GFX90A-NEXT:  .LBB22_4: ; %atomicrmw.private
+; GFX90A-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX90A-NEXT:    buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX90A-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX90A-NEXT:    buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT:    buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT:    v_add_co_u32_e32 v6, vcc, 0x7f8, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; GFX908-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v7
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB22_3
+; GFX908-NEXT:  ; %bb.1: ; %Flow2
+; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB22_6
+; GFX908-NEXT:  .LBB22_2: ; %atomicrmw.phi
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-NEXT:  .LBB22_3: ; %atomicrmw.global
+; GFX908-NEXT:    flat_load_dwordx2 v[2:3], v[6:7]
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:  .LBB22_4: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] offset:2040 glc
+; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v5, v3
-; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v4, v2
-; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
-; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB22_4
+; GFX908-NEXT:  ; %bb.5: ; %Flow
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX908-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX908-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_cbranch_execz .LBB22_2
+; GFX908-NEXT:  .LBB22_6: ; %atomicrmw.private
+; GFX908-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX908-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc
+; GFX908-NEXT:    buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT:    buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX908-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0xc0
+; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7f8, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dword v5, v[0:1]
-; GFX8-NEXT:    flat_load_dword v4, v[6:7]
-; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v7
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB22_3
+; GFX8-NEXT:  ; %bb.1: ; %Flow2
+; GFX8-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB22_6
+; GFX8-NEXT:  .LBB22_2: ; %atomicrmw.phi
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-NEXT:  .LBB22_3: ; %atomicrmw.global
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v6
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    flat_load_dword v2, v[6:7]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:  .LBB22_4: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[0:1]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
+; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v5, v3
-; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v4, v2
-; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
-; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+;
GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB22_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB22_2 +; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB22_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB22_4 +; GFX7-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB22_2 +; GFX7-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %unused = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3566,27 +4775,61 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 
0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB23_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB23_6 +; GFX12-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB23_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB23_2 +; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3594,43 +4837,96 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 
s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB23_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB23_4 +; GFX940-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB23_2 +; GFX940-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: flat_load_b64 v[4:5], v[4:5] -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB23_6 +; GFX11-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; 
GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB23_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3639,12 +4935,43 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB23_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB23_4 +; GFX10-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB23_2 +; GFX10-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3652,77 +4979,194 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: s_mov_b64 
s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB23_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB23_4 +; GFX90A-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB23_2 +; GFX90A-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX908-NEXT: s_mov_b64 s[4:5], vcc -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_addc_co_u32_e64 v7, vcc, -1, v1, s[4:5] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB23_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB23_6 +; GFX908-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[2:5] glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v0 
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB23_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB23_2 +; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: flat_load_dword v4, v[6:7] -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB23_3 +; GFX8-NEXT: ; %bb.1: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB23_6 +; GFX8-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[0:1] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], 
v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB23_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB23_2 +; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB23_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB23_4 +; GFX7-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB23_2 +; GFX7-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %unused = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3737,204 +5181,416 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b64 s[0:1], 
src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB24_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB24_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: .LBB24_4: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB24_6 +; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB24_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB24_4 +; GFX940-NEXT: .LBB24_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; 
GFX940-NEXT: .LBB24_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB24_2 +; GFX940-NEXT: .LBB24_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB24_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB24_4: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: s_waitcnt 
vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execz .LBB24_4 +; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB24_2 +; GFX10-NEXT: ; %bb.3: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: .LBB24_4: ; %Flow2 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB24_6 +; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: 
flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB24_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB24_4: ; %Flow2 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB24_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; 
GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB24_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB24_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: .LBB24_4: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB24_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB24_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB24_2: ; 
%atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB24_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: .LBB24_4: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB24_6 +; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: flat_load_dword v5, v[5:6] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB24_4 +; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[2:3] +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], 
v[0:1], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB24_2 +; GFX7-NEXT: ; %bb.3: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: .LBB24_4: ; %Flow2 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB24_6 +; GFX7-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret double %result @@ -3948,148 +5604,382 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB25_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB25_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: .LBB25_4: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB25_6 +; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB25_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB25_4 +; GFX940-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB25_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB25_2 +; GFX940-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB25_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB25_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB25_4: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB25_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB25_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: 
s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB25_4 +; GFX10-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB25_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB25_2 +; GFX10-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB25_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB25_4 +; GFX90A-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB25_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB25_2 +; GFX90A-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 
v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB25_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB25_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: .LBB25_4: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB25_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, 
v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB25_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB25_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: .LBB25_4: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB25_6 +; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB25_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB25_4 +; GFX7-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB25_3: ; %atomicrmw.global +; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[4:5], v[2:3] glc ; GFX7-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB25_2 +; GFX7-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret double %result diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index dbf2626ec4d4f..d96d3db9f005d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -2770,148 +2770,382 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB18_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: 
$vgpr4_vgpr5 +; GFX12-NEXT: .LBB18_4: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB18_6 +; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB18_2 +; GFX940-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; 
GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB18_4: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB18_2 +; GFX10-NEXT: .LBB18_4: ; %atomicrmw.private +; 
GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB18_2 +; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB18_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, 
v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB18_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: .LBB18_4: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB18_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB18_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: 
v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB18_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB18_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: .LBB18_4: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB18_6 +; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB18_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB18_4 +; GFX7-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB18_2 +; GFX7-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result @@ -2925,127 +3159,329 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_6 +; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB19_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB19_2 +; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB19_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB19_4 +; GFX940-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:2040 sc0 +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB19_2 +; GFX940-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_6 +; GFX11-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], 
v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB19_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB19_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX11-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB19_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB19_4 +; GFX10-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB19_2 +; GFX10-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], 
v[0:1] +; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:2040 glc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB19_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB19_4 +; GFX90A-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB19_2 +; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040 ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_6 +; GFX908-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; 
GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB19_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB19_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB19_2 +; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB19_3 +; GFX8-NEXT: ; %bb.1: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB19_6 +; GFX8-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB19_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3056,21 +3492,71 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB19_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB19_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB19_2 +; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB19_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB19_4 +; GFX7-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB19_2 +; GFX7-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3085,141 +3571,330 @@ define double 
@flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:-2048 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB20_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB20_6 +; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB20_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB20_2 +; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB20_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB20_4 +; GFX940-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB20_2 +; GFX940-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo -; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, 0xfffff800, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v4, vcc_lo -; GFX11-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB20_3 +; GFX11-NEXT: ; %bb.1: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB20_6 +; GFX11-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[5:6], v[7:10] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB20_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB20_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX11-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB20_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB20_4 +; GFX10-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB20_2 +; GFX10-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: 
v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_4 +; GFX90A-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB20_2 +; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v5, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: s_mov_b64 s[4:5], vcc -; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v4, vcc -; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: v_addc_co_u32_e64 v6, vcc, -1, v4, s[4:5] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], 
vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB20_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB20_6 +; GFX908-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[5:6], v[7:10] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB20_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB20_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB20_2 +; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX908-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB20_3 +; GFX8-NEXT: ; %bb.1: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB20_6 +; GFX8-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, 
vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB20_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3230,21 +3905,71 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB20_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB20_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB20_2 +; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB20_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB20_4 +; GFX7-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB20_2 +; GFX7-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v7, 
s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3259,10 +3984,26 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB21_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB21_6 +; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] @@ -3275,11 +4016,26 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB21_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB21_2 +; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3287,19 +4043,59 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: 
v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB21_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB21_4 +; GFX940-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB21_2 +; GFX940-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] ; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB21_6 +; GFX11-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3312,40 +4108,126 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB21_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB21_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB21_4 +; GFX10-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB21_2 +; GFX10-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 +; GFX90A-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB21_2 +; GFX90A-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: 
buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB21_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB21_6 +; GFX908-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3355,24 +4237,54 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX908-NEXT: v_mov_b32_e32 v5, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB21_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB21_2 +; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: 
s_cbranch_execnz .LBB21_3 +; GFX8-NEXT: ; %bb.1: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB21_6 +; GFX8-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[2:3] +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] @@ -3382,20 +4294,69 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB21_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB21_2 +; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB21_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB21_4 +; GFX7-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB21_2 +; GFX7-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 
v[0:1], v[0:1], v[0:1] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void @@ -3409,27 +4370,61 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB22_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB22_6 +; GFX12-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB22_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB22_2 +; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] 
+; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3437,36 +4432,95 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB22_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB22_4 +; GFX940-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] offset:2040 +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB22_2 +; GFX940-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB22_3 +; GFX11-NEXT: ; %bb.1: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB22_6 +; GFX11-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], 
v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB22_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB22_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3475,83 +4529,238 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB22_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB22_4 +; GFX10-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB22_2 +; GFX10-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: 
buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] offset:2040 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB22_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB22_4 +; GFX90A-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB22_2 +; GFX90A-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040 -; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB22_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB22_6 +; GFX908-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] offset:2040 glc +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; 
GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v3 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v2 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB22_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB22_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB22_2 +; GFX908-NEXT: .LBB22_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: flat_load_dword v4, v[6:7] -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB22_3 +; GFX8-NEXT: ; %bb.1: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB22_6 +; GFX8-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[0:1] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: 
buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB22_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB22_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB22_2 +; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB22_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB22_4 +; GFX7-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB22_3: ; %atomicrmw.global ; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB22_2 +; GFX7-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %unused = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3566,27 +4775,61 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX12-NEXT: s_wait_samplecnt 
0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:-2048 -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB23_3 +; GFX12-NEXT: ; %bb.1: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB23_6 +; GFX12-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB23_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB23_2 +; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3594,43 +4837,96 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX940: ; 
%bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB23_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB23_4 +; GFX940-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB23_2 +; GFX940-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-NEXT: flat_load_b64 v[4:5], v[4:5] -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB23_3 +; GFX11-NEXT: ; %bb.1: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB23_6 +; GFX11-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB23_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB23_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3639,12 +4935,43 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB23_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB23_4 +; GFX10-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB23_2 +; GFX10-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: 
flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -3652,77 +4979,194 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB23_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB23_4 +; GFX90A-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB23_2 +; GFX90A-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 -; GFX908-NEXT: s_mov_b64 s[4:5], vcc -; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[4:5] -; GFX908-NEXT: v_max_f64 v[8:9], v[2:3], v[2:3] -; GFX908-NEXT: v_addc_co_u32_e64 v7, vcc, -1, v1, s[4:5] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB23_3 +; GFX908-NEXT: ; %bb.1: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB23_6 +; GFX908-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] -; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[8:9] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[2:5] glc +; GFX908-NEXT: v_max_f64 v[0:1], 
v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB23_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB23_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB23_2 +; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v5, v[0:1] -; GFX8-NEXT: flat_load_dword v4, v[6:7] -; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB23_3 +; GFX8-NEXT: ; %bb.1: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB23_6 +; GFX8-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[6:7] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] -; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[0:1] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX8-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v5, v3 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB23_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB23_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB23_2 +; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB23_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB23_4 +; GFX7-NEXT: .LBB23_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB23_3: ; %atomicrmw.global ; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB23_2 +; GFX7-NEXT: .LBB23_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %unused = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -3737,204 +5181,416 @@ define double 
@flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB24_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] -; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB24_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: .LBB24_4: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB24_6 +; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; 
implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB24_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB24_4 +; GFX940-NEXT: .LBB24_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB24_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB24_2 +; GFX940-NEXT: .LBB24_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB24_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB24_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: 
; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB24_4: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execz .LBB24_4 +; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 -; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB24_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB24_2 +; GFX10-NEXT: ; %bb.3: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: .LBB24_4: ; %Flow2 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB24_6 +; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: 
buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB24_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX90A-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB24_4: ; %Flow2 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB24_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], 
v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB24_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB24_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB24_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: .LBB24_4: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB24_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 
s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB24_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB24_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB24_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: .LBB24_4: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB24_6 +; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: flat_load_dword v5, v[5:6] -; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB24_4 +; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v3, v[2:3] +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; 
GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX7-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB24_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB24_2 +; GFX7-NEXT: ; %bb.3: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: .LBB24_4: ; %Flow2 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB24_6 +; GFX7-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret double %result @@ -3948,148 +5604,382 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB25_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] 
-; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] +; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB25_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: .LBB25_4: ; %Flow2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB25_6 +; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB25_3 +; GFX940-NEXT: ; %bb.1: ; %Flow +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB25_4 +; GFX940-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB25_3: ; %atomicrmw.global ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB25_2 +; GFX940-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB25_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB25_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB25_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB25_4: ; %Flow2 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB25_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: 
s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB25_3 +; GFX10-NEXT: ; %bb.1: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB25_4 +; GFX10-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB25_3: ; %atomicrmw.global ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB25_2 +; GFX10-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB25_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB25_4 +; GFX90A-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB25_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB25_2 +; GFX90A-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 
exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB25_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX908-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: v_mov_b32_e32 v9, v3 +; GFX908-NEXT: v_mov_b32_e32 v8, v2 +; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB25_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB25_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: .LBB25_4: ; %Flow2 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB25_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: 
s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB25_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v7, v5 -; GFX8-NEXT: v_mov_b32_e32 v6, v4 -; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB25_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB25_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: .LBB25_4: ; %Flow2 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB25_6 +; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-NEXT: v_mov_b32_e32 v4, v0 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB25_3 +; GFX7-NEXT: ; %bb.1: ; %Flow +; GFX7-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB25_4 +; GFX7-NEXT: .LBB25_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB25_3: ; %atomicrmw.global +; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB25_2 +; GFX7-NEXT: .LBB25_4: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret double %result diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 9cc4f3987b320..14f75814128f1 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -3234,9 +3234,17 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB16_4 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3248,11 +3256,27 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB16_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: .LBB16_4: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB16_6 +; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e64 v[0:1], v[4:5], -v[2:3] +; 
GFX12-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX12-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -3261,9 +3285,16 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB16_4 +; GFX940-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] @@ -3273,21 +3304,44 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB16_2 +; GFX940-NEXT: ; %bb.3: ; %Flow +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: .LBB16_4: ; %Flow3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB16_6 +; GFX940-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v6, v[0:1], off sc0 sc1 +; GFX940-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v4 ; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB16_4 +; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 @@ -3299,11 +3353,25 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, 
double %val) #0 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB16_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB16_2 +; GFX11-NEXT: ; %bb.3: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: .LBB16_4: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB16_6 +; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX11-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3311,9 +3379,16 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX10-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execz .LBB16_4 +; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v7, v5 @@ -3325,10 +3400,28 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB16_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB16_2 +; GFX10-NEXT: ; %bb.3: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: .LBB16_4: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB16_6 +; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX10-NEXT: .LBB16_6: ; %atomicrmw.phi +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; 
GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 @@ -3337,9 +3430,16 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB16_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] @@ -3348,21 +3448,45 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB16_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB16_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB16_4 +; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v7, v5 @@ -3372,24 +3496,50 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB16_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB16_2 +; GFX908-NEXT: ; %bb.3: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: .LBB16_4: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB16_6 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX908-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB16_4 +; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[4:5] ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v7, v5 @@ -3399,24 +3549,51 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB16_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB16_2 +; GFX8-NEXT: ; %bb.3: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: .LBB16_4: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB16_6 +; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 +; GFX8-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX8-NEXT: 
buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX8-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB16_4 +; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v5, v[4:5] ; GFX7-NEXT: flat_load_dword v4, v[0:1] -; GFX7-NEXT: flat_load_dword v5, v[5:6] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v7, v5 @@ -3426,13 +3603,31 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB16_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7-NEXT: ; %bb.3: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: .LBB16_4: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB16_6 +; GFX7-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 +; GFX7-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v5, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen +; GFX7-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub ptr %ptr, double %val syncscope("agent") seq_cst ret double %result @@ -3446,78 +3641,164 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB17_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB17_6 +; GFX12-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB17_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: v_add_f64_e64 v[6:7], v[8:9], -v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB17_2 +; GFX12-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e64 v[2:3], v[0:1], -v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB17_3 +; GFX940-NEXT: ; %bb.1: ; %Flow3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB17_6 +; GFX940-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; 
GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB17_3: ; %atomicrmw.global +; GFX940-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1] +; GFX940-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 sc0 +; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB17_4 +; GFX940-NEXT: ; %bb.5: ; %Flow +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB17_2 +; GFX940-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB17_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB17_6 +; GFX11-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB17_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc +; 
GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB17_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: @@ -3525,9 +3806,22 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB17_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB17_6 +; GFX10-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB17_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v9, v1 @@ -3539,71 +3833,158 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB17_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB17_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB17_2 +; GFX10-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; 
GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_6 +; GFX90A-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 +; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB17_2 +; GFX90A-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: flat_load_dwordx2 v[4:5], 
v[0:1] offset:2040 -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB17_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB17_6 +; GFX908-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB17_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v7, v5 -; GFX908-NEXT: v_mov_b32_e32 v6, v4 -; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc +; GFX908-NEXT: v_mov_b32_e32 v9, v1 +; GFX908-NEXT: v_mov_b32_e32 v8, v0 +; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB17_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB17_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB17_2 +; GFX908-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 -; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB17_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB17_6 +; GFX8-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB17_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: flat_load_dword v0, v[4:5] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v9, v1 @@ -3613,24 +3994,55 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB17_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB17_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB17_2 +; GFX8-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB17_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB17_6 +; GFX7-NEXT: .LBB17_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB17_3: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GFX7-NEXT: flat_load_dword v1, v[0:1] ; GFX7-NEXT: flat_load_dword v0, v[4:5] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v9, v1 @@ -3640,11 +4052,27 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX7-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB17_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB17_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB17_2 +; GFX7-NEXT: .LBB17_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 255 %result = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst @@ -3659,44 +4087,84 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB18_3 +; GFX12-NEXT: ; %bb.1: ; %Flow3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB18_6 +; GFX12-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3] +; GFX12-NEXT: v_add_f64_e64 v[6:7], v[8:9], -v[2:3] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; 
GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB18_4 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execz .LBB18_2 +; GFX12-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_f64_e64 v[2:3], v[0:1], -v[2:3] +; GFX12-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 ; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_3 +; GFX940-NEXT: ; %bb.1: ; %Flow3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_6 +; GFX940-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX940-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX940-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1] @@ -3706,41 +4174,77 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB18_4 +; GFX940-NEXT: ; %bb.5: ; %Flow +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX940-NEXT: s_cbranch_execz .LBB18_2 +; GFX940-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX940-NEXT: 
scratch_load_dwordx2 v[0:1], v4, off +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX940-NEXT: scratch_store_dwordx2 v4, v[2:3], off sc0 sc1 ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo -; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, 0xfffff800, v5 -; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v4, vcc_lo -; GFX11-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_3 +; GFX11-NEXT: ; %bb.1: ; %Flow3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_6 +; GFX11-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], -v[2:3] +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[5:6], v[7:10] glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execnz .LBB18_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB18_4 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX11-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3749,9 +4253,22 @@ define 
double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_3 +; GFX10-NEXT: ; %bb.1: ; %Flow3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_6 +; GFX10-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5] -; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v9, v1 @@ -3763,10 +4280,26 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: s_cbranch_execnz .LBB18_1 -; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB18_4 +; GFX10-NEXT: ; %bb.5: ; %Flow +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 +; GFX10-NEXT: s_cbranch_execz .LBB18_2 +; GFX10-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3775,11 +4308,22 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_6 +; GFX90A-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
GFX90A-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] @@ -3788,11 +4332,26 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 +; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB18_2 +; GFX90A-NEXT: .LBB18_6: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: @@ -3800,11 +4359,22 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_6 +; GFX908-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v9, v1 @@ -3814,24 +4384,54 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB18_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB18_4
+; GFX908-NEXT: ; %bb.5: ; %Flow
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execz .LBB18_2
+; GFX908-NEXT: .LBB18_6: ; %atomicrmw.private
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3]
+; GFX908-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX908-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
+; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB18_3
+; GFX8-NEXT: ; %bb.1: ; %Flow3
+; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB18_6
+; GFX8-NEXT: .LBB18_2: ; %atomicrmw.phi
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-NEXT: .LBB18_3: ; %atomicrmw.global
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; GFX8-NEXT: flat_load_dword v1, v[0:1]
 ; GFX8-NEXT: flat_load_dword v0, v[4:5]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: .LBB18_4: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v9, v1
@@ -3841,24 +4441,55 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB18_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB18_4
+; GFX8-NEXT: ; %bb.5: ; %Flow
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_cbranch_execz .LBB18_2
+; GFX8-NEXT: .LBB18_6: ; %atomicrmw.private
+; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4
+; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3]
+; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg:
 ; GFX7: ; %bb.0:
 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0
+; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB18_3
+; GFX7-NEXT: ; %bb.1: ; %Flow3
+; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB18_6
+; GFX7-NEXT: .LBB18_2: ; %atomicrmw.phi
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+; GFX7-NEXT: .LBB18_3: ; %atomicrmw.global
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; GFX7-NEXT: flat_load_dword v1, v[0:1]
 ; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: .LBB18_4: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v9, v1
@@ -3868,11 +4499,27 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB18_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB18_4
+; GFX7-NEXT: ; %bb.5: ; %Flow
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_cbranch_execz .LBB18_2
+; GFX7-NEXT: .LBB18_6: ; %atomicrmw.private
+; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4
+; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3]
+; GFX7-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
 ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr double, ptr %ptr, i64 -256
 %result = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst
@@ -3887,9 +4534,25 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 {
 ; GFX12-NEXT: s_wait_samplecnt 0x0
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX12-NEXT: s_mov_b32 s0, exec_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB19_3
+; GFX12-NEXT: ; %bb.1: ; %Flow3
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB19_6
+; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global
 ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB19_4: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
@@ -3900,11 +4563,24 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 {
 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
 ; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB19_4
+; GFX12-NEXT: ; %bb.5: ; %Flow
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX12-NEXT: s_cbranch_execz .LBB19_2
+; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3]
+; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off
 ; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB19_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -3912,9 +4588,21 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 {
 ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB19_3
+; GFX940-NEXT: ; %bb.1: ; %Flow3
+; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB19_6
+; GFX940-NEXT: .LBB19_2: ; %atomicrmw.phi
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+; GFX940-NEXT: .LBB19_3: ; %atomicrmw.global
 ; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: .LBB19_4: ; %atomicrmw.start
 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
@@ -3923,20 +4611,46 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 {
 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT: buffer_inv sc1
 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB19_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB19_4
+; GFX940-NEXT: ; %bb.5: ; %Flow
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT: s_cbranch_execz .LBB19_2
+; GFX940-NEXT: .LBB19_6: ; %atomicrmw.private
+; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
 ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB19_3
+; GFX11-NEXT: ; %bb.1: ; %Flow3
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB19_6
+; GFX11-NEXT: .LBB19_2: ; %atomicrmw.phi
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB19_3: ; %atomicrmw.global
 ; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1]
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: .LBB19_4: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
@@ -3947,20 +4661,44 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 {
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB19_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB19_4
+; GFX11-NEXT: ; %bb.5: ; %Flow
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB19_2
+; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off
 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fsub_noret_f64:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB19_3
+; GFX10-NEXT: ; %bb.1: ; %Flow3
+; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB19_6
+; GFX10-NEXT: .LBB19_2: ; %atomicrmw.phi
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: .LBB19_3: ; %atomicrmw.global
 ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: .LBB19_4: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
@@ -3972,19 +4710,47 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 {
 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX10-NEXT: v_mov_b32_e32 v7, v5
 ; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB19_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB19_4
+; GFX10-NEXT: ; %bb.5: ; %Flow
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT: s_cbranch_execz .LBB19_2
+; GFX10-NEXT: .LBB19_6: ; %atomicrmw.private
+; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f64:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB19_3
+; GFX90A-NEXT: ; %bb.1: ; %Flow3
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB19_6
+; GFX90A-NEXT: .LBB19_2: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-NEXT: .LBB19_3: ; %atomicrmw.global
 ; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
@@ -3992,20 +4758,47 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 {
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
+; GFX90A-NEXT: ; %bb.5: ; %Flow
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB19_2
+; GFX90A-NEXT: .LBB19_6: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f64:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB19_3
+; GFX908-NEXT: ; %bb.1: ; %Flow3
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB19_6
+; GFX908-NEXT: .LBB19_2: ; %atomicrmw.phi
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; GFX908-NEXT: .LBB19_3: ; %atomicrmw.global
 ; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: .LBB19_4: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
@@ -4014,23 +4807,52 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 {
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB19_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB19_4
+; GFX908-NEXT: ; %bb.5: ; %Flow
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execz .LBB19_2
+; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f64:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
+; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB19_3
+; GFX8-NEXT: ; %bb.1: ; %Flow3
+; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB19_6
+; GFX8-NEXT: .LBB19_2: ; %atomicrmw.phi
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-NEXT: .LBB19_3: ; %atomicrmw.global
 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
 ; GFX8-NEXT: flat_load_dword v7, v[4:5]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: .LBB19_4: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
@@ -4039,23 +4861,53 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 {
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX8-NEXT: v_mov_b32_e32 v7, v5
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB19_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB19_4
+; GFX8-NEXT: ; %bb.5: ; %Flow
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_cbranch_execz .LBB19_2
+; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private
+; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4
+; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f64:
 ; GFX7: ; %bb.0:
 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0
+; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB19_3
+; GFX7-NEXT: ; %bb.1: ; %Flow3
+; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB19_6
+; GFX7-NEXT: .LBB19_2: ; %atomicrmw.phi
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+; GFX7-NEXT: .LBB19_3: ; %atomicrmw.global
 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
 ; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v6, v[0:1]
 ; GFX7-NEXT: flat_load_dword v7, v[4:5]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: .LBB19_4: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
@@ -4064,12 +4916,28 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 {
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB19_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB19_4
+; GFX7-NEXT: ; %bb.5: ; %Flow
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_cbranch_execz .LBB19_2
+; GFX7-NEXT: .LBB19_6: ; %atomicrmw.private
+; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4
+; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
 ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
 %unused = atomicrmw fsub ptr %ptr, double %val syncscope("agent") seq_cst
 ret void
@@ -4083,24 +4951,56 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v
 ; GFX12-NEXT: s_wait_samplecnt 0x0
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] offset:2040
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX12-NEXT: s_mov_b32 s0, exec_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB20_3
+; GFX12-NEXT: ; %bb.1: ; %Flow3
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB20_6
+; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global
+; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB20_4: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
 ; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB20_4
+; GFX12-NEXT: ; %bb.5: ; %Flow
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX12-NEXT: s_cbranch_execz .LBB20_2
+; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3]
+; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off
 ; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB20_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -4108,46 +5008,101 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v
 ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:2040
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8
+; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB20_3
+; GFX940-NEXT: ; %bb.1: ; %Flow3
+; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB20_6
+; GFX940-NEXT: .LBB20_2: ; %atomicrmw.phi
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+; GFX940-NEXT: .LBB20_3: ; %atomicrmw.global
+; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: .LBB20_4: ; %atomicrmw.start
 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
 ; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 sc0
+; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT: buffer_inv sc1
 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB20_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB20_4
+; GFX940-NEXT: ; %bb.5: ; %Flow
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT: s_cbranch_execz .LBB20_2
+; GFX940-NEXT: .LBB20_6: ; %atomicrmw.private
+; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
 ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] offset:2040
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB20_3
+; GFX11-NEXT: ; %bb.1: ; %Flow3
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB20_6
+; GFX11-NEXT: .LBB20_2: ; %atomicrmw.phi
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB20_3: ; %atomicrmw.global
+; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: .LBB20_4: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB20_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB20_4
+; GFX11-NEXT: ; %bb.5: ; %Flow
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off
 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -4156,9 +5111,21 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB20_3
+; GFX10-NEXT: ; %bb.1: ; %Flow3
+; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB20_6
+; GFX10-NEXT: .LBB20_2: ; %atomicrmw.phi
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: .LBB20_3: ; %atomicrmw.global
 ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: .LBB20_4: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
@@ -4170,108 +5137,242 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v
 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX10-NEXT: v_mov_b32_e32 v7, v5
 ; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB20_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB20_4
+; GFX10-NEXT: ; %bb.5: ; %Flow
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT: s_cbranch_execz .LBB20_2
+; GFX10-NEXT: .LBB20_6: ; %atomicrmw.private
+; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:2040
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB20_3
+; GFX90A-NEXT: ; %bb.1: ; %Flow3
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB20_6
+; GFX90A-NEXT: .LBB20_2: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-NEXT: .LBB20_3: ; %atomicrmw.global
+; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
-; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execnz .LBB20_4
+; GFX90A-NEXT: ; %bb.5: ; %Flow
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB20_2
+; GFX90A-NEXT: .LBB20_6: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:2040
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB20_3
+; GFX908-NEXT: ; %bb.1: ; %Flow3
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB20_6
+; GFX908-NEXT: .LBB20_2: ; %atomicrmw.phi
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; GFX908-NEXT: .LBB20_3: ; %atomicrmw.global
+; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: .LBB20_4: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB20_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB20_4
+; GFX908-NEXT: ; %bb.5: ; %Flow
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execz .LBB20_2
+; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7f8, v0
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0
+; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0
 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
-; GFX8-NEXT: flat_load_dword v6, v[8:9]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB20_3
+; GFX8-NEXT: ; %bb.1: ; %Flow3
+; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB20_6
+; GFX8-NEXT: .LBB20_2: ; %atomicrmw.phi
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-NEXT: .LBB20_3: ; %atomicrmw.global
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: .LBB20_4: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB20_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB20_4
+; GFX8-NEXT: ; %bb.5: ; %Flow
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_cbranch_execz .LBB20_2
+; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private
+; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4
+; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
 ; GFX7: ; %bb.0:
 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x7f8, v0
-; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0
+; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0
 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
-; GFX7-NEXT: flat_load_dword v6, v[8:9]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB20_3
+; GFX7-NEXT: ; %bb.1: ; %Flow3
+; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB20_6
+; GFX7-NEXT: .LBB20_2: ; %atomicrmw.phi
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+; GFX7-NEXT: .LBB20_3: ; %atomicrmw.global
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: .LBB20_4: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB20_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB20_4
+; GFX7-NEXT: ; %bb.5: ; %Flow
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_cbranch_execz .LBB20_2
+; GFX7-NEXT: .LBB20_6: ; %atomicrmw.private
+; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4
+; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
 ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr double, ptr %ptr, i64 255
 %unused = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst
@@ -4286,24 +5387,56 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
 ; GFX12-NEXT: s_wait_samplecnt 0x0
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] offset:-2048
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX12-NEXT: s_mov_b32 s0, exec_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB21_3
+; GFX12-NEXT: ; %bb.1: ; %Flow3
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB21_6
+; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global
+; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
 ; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB21_4
+; GFX12-NEXT: ; %bb.5: ; %Flow
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX12-NEXT: s_cbranch_execz .LBB21_2
+; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX12-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX12-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_f64_e64 v[0:1], v[0:1], -v[2:3]
+; GFX12-NEXT: scratch_store_b64 v4, v[0:1], off
 ; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB21_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -4311,15 +5444,24 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
 ; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
 ; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[4:5]
 ; GFX940-NEXT: s_mov_b32 s1, -1
 ; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start
+; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB21_3
+; GFX940-NEXT: ; %bb.1: ; %Flow3
+; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB21_6
+; GFX940-NEXT: .LBB21_2: ; %atomicrmw.phi
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+; GFX940-NEXT: .LBB21_3: ; %atomicrmw.global
+; GFX940-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: .LBB21_4: ; %atomicrmw.start
 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
@@ -4328,24 +5470,49 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT: buffer_inv sc1
 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB21_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB21_4
+; GFX940-NEXT: ; %bb.5: ; %Flow
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT: s_cbranch_execz .LBB21_2
+; GFX940-NEXT: .LBB21_6: ; %atomicrmw.private
+; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1
 ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b64 v[6:7], v[4:5]
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start
+; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB21_3
+; GFX11-NEXT: ; %bb.1: ; %Flow3
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB21_6
+; GFX11-NEXT: .LBB21_2: ; %atomicrmw.phi
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global
+; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
@@ -4356,11 +5523,23 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB21_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB21_4
+; GFX11-NEXT: ; %bb.5: ; %Flow
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX11-NEXT: scratch_load_b64 v[0:1], v4, off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX11-NEXT: scratch_store_b64 v4, v[0:1], off
 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -4369,9 +5548,21 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB21_3
+; GFX10-NEXT: ; %bb.1: ; %Flow3
+; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB21_6
+; GFX10-NEXT: .LBB21_2: ; %atomicrmw.phi
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: .LBB21_3: ; %atomicrmw.global
 ; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: .LBB21_4: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3]
@@ -4383,116 +5574,242 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX10-NEXT: v_mov_b32_e32 v7, v5
 ; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB21_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB21_4
+; GFX10-NEXT: ; %bb.5: ; %Flow
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT: s_cbranch_execz .LBB21_2
+; GFX10-NEXT: .LBB21_6: ; %atomicrmw.private
+; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
+; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v1, vcc
 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB21_3
+; GFX90A-NEXT: ; %bb.1: ; %Flow3
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB21_6
+; GFX90A-NEXT: .LBB21_2: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.global
v[0:1] -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 +; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB21_2 +; GFX90A-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_co_u32_e32 v8, vcc, 0xfffff800, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v1, vcc ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB21_3 +; GFX908-NEXT: ; %bb.1: ; %Flow3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB21_6 +; GFX908-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v7, v1 -; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v6, v0 -; 
GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX908-NEXT: s_cbranch_execnz .LBB21_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB21_4 +; GFX908-NEXT: ; %bb.5: ; %Flow +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB21_2 +; GFX908-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX908-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX908-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX908-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xfffff800, v0 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX8-NEXT: flat_load_dword v7, v[0:1] -; GFX8-NEXT: flat_load_dword v6, v[8:9] -; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB21_3 +; GFX8-NEXT: ; %bb.1: ; %Flow3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB21_6 +; GFX8-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v7, v1 -; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8-NEXT: v_mov_b32_e32 v6, v0 -; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX8-NEXT: s_cbranch_execnz .LBB21_1 -; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB21_4 +; GFX8-NEXT: ; %bb.5: ; %Flow +; GFX8-NEXT: s_or_b64 exec, 
exec, s[6:7] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB21_2 +; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GFX8-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0xfffff800, v0 -; GFX7-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc -; GFX7-NEXT: flat_load_dword v7, v[0:1] -; GFX7-NEXT: flat_load_dword v6, v[8:9] -; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB21_3 +; GFX7-NEXT: ; %bb.1: ; %Flow3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB21_6 +; GFX7-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: s_mov_b64 s[6:7], 0 +; GFX7-NEXT: .LBB21_4: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v7, v1 -; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v6, v0 -; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX7-NEXT: s_cbranch_execnz .LBB21_1 -; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX7-NEXT: s_cbranch_execnz .LBB21_4 +; GFX7-NEXT: ; %bb.5: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB21_2 +; GFX7-NEXT: .LBB21_6: ; %atomicrmw.private +; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v1, 
v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %ptr, i64 -256 %unused = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index eded1ee04625b..839f4a18508e5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -6,43 +6,136 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_add_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB0_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB0_4 +; GCN1-NEXT: .LBB0_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB0_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB0_2 +; GCN1-NEXT: .LBB0_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 
s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB0_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB0_4 +; GCN2-NEXT: .LBB0_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB0_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB0_2 +; GCN2-NEXT: .LBB0_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB0_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB0_4 +; GFX12-NEXT: .LBB0_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB0_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB0_2 +; GFX12-NEXT: .LBB0_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -53,39 +146,104 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_add_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; 
GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB1_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB1_3 +; GCN1-NEXT: s_branch .LBB1_4 +; GCN1-NEXT: .LBB1_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB1_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB1_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB1_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], 
v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB1_3 +; GCN2-NEXT: s_branch .LBB1_4 +; GCN2-NEXT: .LBB1_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB1_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB1_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -94,12 +252,34 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB1_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB1_3 +; GFX12-NEXT: s_branch .LBB1_4 +; GFX12-NEXT: .LBB1_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB1_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB1_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -113,40 +293,109 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, 
s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB2_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB2_4 +; GCN1-NEXT: .LBB2_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB2_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB2_2 +; GCN1-NEXT: .LBB2_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB2_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB2_4 +; GCN2-NEXT: .LBB2_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB2_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB2_2 +; GCN2-NEXT: .LBB2_4: ; 
%atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_addr64_offset: @@ -156,13 +405,37 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB2_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB2_4 +; GFX12-NEXT: .LBB2_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB2_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB2_2 +; GFX12-NEXT: .LBB2_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -174,43 +447,108 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; 
GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB3_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB3_3 +; GCN1-NEXT: s_branch .LBB3_4 +; GCN1-NEXT: .LBB3_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB3_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB3_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB3_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB3_3 +; GCN2-NEXT: s_branch .LBB3_4 +; GCN2-NEXT: .LBB3_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB3_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, 
s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v5, vcc, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB3_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -219,13 +557,35 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB3_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB3_3 +; GFX12-NEXT: s_branch .LBB3_4 +; GFX12-NEXT: .LBB3_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB3_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB3_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -240,39 +600,130 @@ entry: define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_add_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 
s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB4_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB4_4 +; GCN1-NEXT: .LBB4_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB4_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB4_2 +; GCN1-NEXT: .LBB4_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB4_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB4_4 +; GCN2-NEXT: .LBB4_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB4_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB4_2 +; GCN2-NEXT: .LBB4_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: 
s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB4_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB4_4 +; GFX12-NEXT: .LBB4_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB4_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB4_2 +; GFX12-NEXT: .LBB4_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst @@ -282,16 +733,49 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_add_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB5_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB5_3 +; GCN1-NEXT: s_branch .LBB5_4 +; GCN1-NEXT: .LBB5_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB5_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB5_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -299,16 +783,48 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_add_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, 
SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB5_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB5_3 +; GCN2-NEXT: s_branch .LBB5_4 +; GCN2-NEXT: .LBB5_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB5_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v5, vcc, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB5_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -319,12 +835,32 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB5_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB5_3 +; GFX12-NEXT: s_branch .LBB5_4 +; GFX12-NEXT: .LBB5_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB5_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB5_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -337,36 +873,105 @@ entry: define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 
%in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB6_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB6_4 +; GCN1-NEXT: .LBB6_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB6_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB6_2 +; GCN1-NEXT: .LBB6_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB6_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB6_4 +; GCN2-NEXT: .LBB6_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB6_3: ; %atomicrmw.global +; GCN2-NEXT: 
v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB6_2 +; GCN2-NEXT: .LBB6_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_addr64: @@ -374,15 +979,38 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB6_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB6_4 +; GFX12-NEXT: .LBB6_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB6_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB6_2 +; GFX12-NEXT: .LBB6_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -393,54 +1021,140 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_add_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB7_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB7_3 +; GCN1-NEXT: s_branch .LBB7_4 +; GCN1-NEXT: .LBB7_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB7_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB7_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB7_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB7_3 +; GCN2-NEXT: s_branch .LBB7_4 +; GCN2-NEXT: .LBB7_2: 
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB7_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v5, vcc, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB7_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_add_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB7_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB7_3 +; GFX12-NEXT: s_branch .LBB7_4 +; GFX12-NEXT: .LBB7_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB7_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB7_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -454,43 +1168,134 @@ entry: define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_and_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: 
s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB8_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB8_4 +; GCN1-NEXT: .LBB8_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB8_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB8_2 +; GCN1-NEXT: .LBB8_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB8_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB8_4 +; GCN2-NEXT: .LBB8_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB8_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB8_2 +; GCN2-NEXT: .LBB8_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: 
v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB8_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB8_4 +; GFX12-NEXT: .LBB8_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB8_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB8_2 +; GFX12-NEXT: .LBB8_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -501,39 +1306,102 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_and_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB9_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB9_3 +; GCN1-NEXT: s_branch .LBB9_4 +; GCN1-NEXT: .LBB9_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB9_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: 
v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v4, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, s1, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB9_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB9_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB9_3 +; GCN2-NEXT: s_branch .LBB9_4 +; GCN2-NEXT: .LBB9_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB9_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v4, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, s1, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB9_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -542,12 +1410,34 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB9_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB9_3 +; GFX12-NEXT: s_branch .LBB9_4 +; GFX12-NEXT: .LBB9_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB9_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX12-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB9_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -561,40 +1451,107 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB10_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB10_4 +; GCN1-NEXT: .LBB10_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB10_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB10_2 +; GCN1-NEXT: .LBB10_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: 
buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB10_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB10_4 +; GCN2-NEXT: .LBB10_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB10_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB10_2 +; GCN2-NEXT: .LBB10_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_addr64_offset: @@ -604,13 +1561,37 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB10_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB10_4 +; GFX12-NEXT: .LBB10_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB10_3: ; 
%atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB10_2 +; GFX12-NEXT: .LBB10_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -622,43 +1603,106 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB11_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB11_3 +; GCN1-NEXT: s_branch .LBB11_4 +; GCN1-NEXT: .LBB11_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB11_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v4, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, s9, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB11_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; 
GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB11_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB11_3 +; GCN2-NEXT: s_branch .LBB11_4 +; GCN2-NEXT: .LBB11_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB11_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v4, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, s9, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB11_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -667,13 +1711,35 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB11_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: 
global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB11_3 +; GFX12-NEXT: s_branch .LBB11_4 +; GFX12-NEXT: .LBB11_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB11_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_and_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB11_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -688,39 +1754,128 @@ entry: define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_and_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB12_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB12_4 +; GCN1-NEXT: .LBB12_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB12_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB12_2 +; GCN1-NEXT: .LBB12_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: 
s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB12_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB12_4 +; GCN2-NEXT: .LBB12_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB12_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB12_2 +; GCN2-NEXT: .LBB12_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB12_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB12_4 +; GFX12-NEXT: .LBB12_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB12_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB12_2 +; GFX12-NEXT: .LBB12_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst @@ -730,16 +1885,48 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_and_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB13_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: 
v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB13_3 +; GCN1-NEXT: s_branch .LBB13_4 +; GCN1-NEXT: .LBB13_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB13_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v4, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, s1, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB13_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -747,16 +1934,47 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_and_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB13_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB13_3 +; GCN2-NEXT: s_branch .LBB13_4 +; GCN2-NEXT: .LBB13_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB13_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v4, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, s1, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB13_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -767,12 +1985,32 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: 
s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB13_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB13_3 +; GFX12-NEXT: s_branch .LBB13_4 +; GFX12-NEXT: .LBB13_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB13_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX12-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB13_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -785,36 +2023,103 @@ entry: define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB14_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB14_4 +; GCN1-NEXT: .LBB14_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB14_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB14_2 +; GCN1-NEXT: .LBB14_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: 
buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB14_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB14_4 +; GCN2-NEXT: .LBB14_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB14_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB14_2 +; GCN2-NEXT: .LBB14_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_addr64: @@ -822,15 +2127,38 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB14_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB14_4 +; GFX12-NEXT: .LBB14_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB14_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 
v3, s3 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB14_2 +; GFX12-NEXT: .LBB14_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -841,54 +2169,138 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_and_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB15_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB15_3 +; GCN1-NEXT: s_branch .LBB15_4 +; GCN1-NEXT: .LBB15_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB15_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v4, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, s9, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB15_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: 
s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB15_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB15_3 +; GCN2-NEXT: s_branch .LBB15_4 +; GCN2-NEXT: .LBB15_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB15_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v4, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, s9, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB15_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_and_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB15_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB15_3 +; GFX12-NEXT: s_branch .LBB15_4 +; GFX12-NEXT: .LBB15_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB15_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; 
GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_and_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB15_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -902,43 +2314,136 @@ entry: define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB16_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB16_4 +; GCN1-NEXT: .LBB16_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB16_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB16_2 +; GCN1-NEXT: .LBB16_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; 
GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB16_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB16_4 +; GCN2-NEXT: .LBB16_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB16_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB16_2 +; GCN2-NEXT: .LBB16_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB16_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB16_4 +; GFX12-NEXT: .LBB16_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB16_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB16_2 +; GFX12-NEXT: .LBB16_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -949,39 +2454,104 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; 
GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB17_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB17_3 +; GCN1-NEXT: s_branch .LBB17_4 +; GCN1-NEXT: .LBB17_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB17_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB17_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB17_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB17_3 +; GCN2-NEXT: s_branch .LBB17_4 +; GCN2-NEXT: .LBB17_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB17_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 
s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB17_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -990,12 +2560,34 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB17_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB17_3 +; GFX12-NEXT: s_branch .LBB17_4 +; GFX12-NEXT: .LBB17_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB17_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB17_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1009,40 +2601,109 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, 
s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB18_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB18_4 +; GCN1-NEXT: .LBB18_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB18_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB18_2 +; GCN1-NEXT: .LBB18_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB18_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB18_4 +; GCN2-NEXT: .LBB18_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB18_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB18_2 +; GCN2-NEXT: .LBB18_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword 
v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_addr64_offset: @@ -1052,13 +2713,37 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB18_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB18_4 +; GFX12-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB18_2 +; GFX12-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1070,43 +2755,108 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; 
GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB19_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB19_3 +; GCN1-NEXT: s_branch .LBB19_4 +; GCN1-NEXT: .LBB19_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB19_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB19_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB19_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB19_3 +; GCN2-NEXT: s_branch .LBB19_4 +; GCN2-NEXT: .LBB19_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB19_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s9 
+; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB19_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -1115,13 +2865,35 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB19_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB19_3 +; GFX12-NEXT: s_branch .LBB19_4 +; GFX12-NEXT: .LBB19_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB19_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB19_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1136,39 +2908,130 @@ entry: define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_sub_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB20_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB20_4 +; GCN1-NEXT: .LBB20_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: 
.LBB20_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB20_2 +; GCN1-NEXT: .LBB20_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB20_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB20_4 +; GCN2-NEXT: .LBB20_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB20_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB20_2 +; GCN2-NEXT: .LBB20_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB20_3 +; 
GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB20_4 +; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB20_2 +; GFX12-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst @@ -1178,16 +3041,49 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB21_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB21_3 +; GCN1-NEXT: s_branch .LBB21_4 +; GCN1-NEXT: .LBB21_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB21_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB21_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1195,16 +3091,48 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_sub_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB21_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB21_3 +; GCN2-NEXT: s_branch .LBB21_4 +; GCN2-NEXT: .LBB21_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB21_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB21_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1215,12 +3143,32 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB21_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB21_3 +; GFX12-NEXT: s_branch .LBB21_4 +; GFX12-NEXT: .LBB21_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB21_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB21_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1233,36 +3181,105 @@ entry: define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; 
GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB22_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB22_4 +; GCN1-NEXT: .LBB22_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB22_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB22_2 +; GCN1-NEXT: .LBB22_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: buffer_load_dword v3, v2, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB22_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB22_4 +; GCN2-NEXT: .LBB22_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB22_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] 
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB22_2 +; GCN2-NEXT: .LBB22_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: buffer_load_dword v3, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_addr64: @@ -1270,15 +3287,38 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB22_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB22_4 +; GFX12-NEXT: .LBB22_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB22_2 +; GFX12-NEXT: .LBB22_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1289,54 +3329,140 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_sub_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: 
s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB23_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB23_3 +; GCN1-NEXT: s_branch .LBB23_4 +; GCN1-NEXT: .LBB23_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB23_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB23_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB23_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB23_3 +; GCN2-NEXT: s_branch .LBB23_4 +; GCN2-NEXT: .LBB23_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB23_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: 
s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB23_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_sub_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB23_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB23_3 +; GFX12-NEXT: s_branch .LBB23_4 +; GFX12-NEXT: .LBB23_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB23_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4 +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB23_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1350,41 +3476,137 @@ entry: define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_max_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB24_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: 
s_cbranch_vccz .LBB24_4 +; GCN1-NEXT: .LBB24_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB24_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_cbranch_execnz .LBB24_2 +; GCN1-NEXT: .LBB24_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB24_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB24_4 +; GCN2-NEXT: .LBB24_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB24_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: s_cbranch_execnz .LBB24_2 +; GCN2-NEXT: .LBB24_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: 
buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB24_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB24_4 +; GFX12-NEXT: .LBB24_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB24_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB24_2 +; GFX12-NEXT: .LBB24_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -1395,38 +3617,105 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_max_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB25_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execz .LBB25_3 +; GCN1-NEXT: s_branch .LBB25_4 +; GCN1-NEXT: .LBB25_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB25_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; 
GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB25_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB25_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execz .LBB25_3 +; GCN2-NEXT: s_branch .LBB25_4 +; GCN2-NEXT: .LBB25_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB25_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB25_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -1436,12 +3725,35 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; 
GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB25_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB25_3 +; GFX12-NEXT: s_branch .LBB25_4 +; GFX12-NEXT: .LBB25_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB25_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB25_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1455,38 +3767,109 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB26_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB26_4 +; GCN1-NEXT: .LBB26_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB26_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB26_2 +; GCN1-NEXT: .LBB26_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; 
GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB26_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB26_4 +; GCN2-NEXT: .LBB26_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB26_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB26_2 +; GCN2-NEXT: .LBB26_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_addr64_offset: @@ -1496,13 +3879,38 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 
:: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB26_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB26_4 +; GFX12-NEXT: .LBB26_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB26_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB26_2 +; GFX12-NEXT: .LBB26_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1514,42 +3922,109 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB27_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB27_3 +; GCN1-NEXT: s_branch .LBB27_4 +; GCN1-NEXT: .LBB27_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB27_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt 
vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB27_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB27_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB27_3 +; GCN2-NEXT: s_branch .LBB27_4 +; GCN2-NEXT: .LBB27_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB27_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB27_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -1559,13 +4034,36 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: 
s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB27_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB27_3 +; GFX12-NEXT: s_branch .LBB27_4 +; GFX12-NEXT: .LBB27_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB27_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB27_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1580,37 +4078,131 @@ entry: define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_max_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB28_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB28_4 +; GCN1-NEXT: .LBB28_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB28_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB28_2 +; GCN1-NEXT: .LBB28_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: 
v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB28_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB28_4 +; GCN2-NEXT: .LBB28_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB28_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB28_2 +; GCN2-NEXT: .LBB28_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB28_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB28_4 +; GFX12-NEXT: .LBB28_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB28_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB28_2 +; GFX12-NEXT: .LBB28_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 
v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst @@ -1620,15 +4212,49 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_max_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB29_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execz .LBB29_3 +; GCN1-NEXT: s_branch .LBB29_4 +; GCN1-NEXT: .LBB29_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB29_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB29_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) @@ -1637,15 +4263,48 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_max_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB29_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execz .LBB29_3 +; GCN2-NEXT: s_branch .LBB29_4 +; GCN2-NEXT: .LBB29_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: 
.LBB29_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB29_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -1657,12 +4316,33 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB29_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB29_3 +; GFX12-NEXT: s_branch .LBB29_4 +; GFX12-NEXT: .LBB29_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB29_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB29_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1675,34 +4355,105 @@ entry: define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: 
s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB30_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB30_4 +; GCN1-NEXT: .LBB30_2: ; %atomicrmw.phi ; GCN1-NEXT: s_endpgm -; +; GCN1-NEXT: .LBB30_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB30_2 +; GCN1-NEXT: .LBB30_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_endpgm +; ; GCN2-LABEL: atomic_max_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB30_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB30_4 +; GCN2-NEXT: .LBB30_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB30_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB30_2 +; GCN2-NEXT: .LBB30_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; 
GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_max_i64_addr64: @@ -1710,15 +4461,39 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB30_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB30_4 +; GFX12-NEXT: .LBB30_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB30_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB30_2 +; GFX12-NEXT: .LBB30_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1729,38 +4504,105 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_max_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB31_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; 
GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB31_3 +; GCN1-NEXT: s_branch .LBB31_4 +; GCN1-NEXT: .LBB31_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB31_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB31_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB31_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB31_3 +; GCN2-NEXT: s_branch .LBB31_4 +; GCN2-NEXT: .LBB31_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB31_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB31_4: ; %atomicrmw.end +; GCN2-NEXT: 
v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -1768,15 +4610,37 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_max_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB31_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB31_3 +; GFX12-NEXT: s_branch .LBB31_4 +; GFX12-NEXT: .LBB31_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB31_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB31_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1790,41 +4654,137 @@ entry: define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB32_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB32_4 +; GCN1-NEXT: .LBB32_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB32_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: 
v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_cbranch_execnz .LBB32_2 +; GCN1-NEXT: .LBB32_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB32_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB32_4 +; GCN2-NEXT: .LBB32_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB32_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: s_cbranch_execnz .LBB32_2 +; GCN2-NEXT: .LBB32_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB32_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB32_4 +; GFX12-NEXT: .LBB32_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB32_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB32_2 +; GFX12-NEXT: .LBB32_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -1835,38 +4795,105 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB33_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execz .LBB33_3 +; GCN1-NEXT: s_branch .LBB33_4 +; GCN1-NEXT: .LBB33_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB33_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: 
v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB33_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB33_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execz .LBB33_3 +; GCN2-NEXT: s_branch .LBB33_4 +; GCN2-NEXT: .LBB33_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB33_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB33_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -1876,12 +4903,35 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; 
GFX12-NEXT: s_cbranch_vccz .LBB33_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB33_3 +; GFX12-NEXT: s_branch .LBB33_4 +; GFX12-NEXT: .LBB33_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB33_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB33_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -1895,38 +4945,109 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB34_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB34_4 +; GCN1-NEXT: .LBB34_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB34_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB34_2 +; GCN1-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; 
GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB34_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB34_4 +; GCN2-NEXT: .LBB34_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB34_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB34_2 +; GCN2-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_addr64_offset: @@ -1936,13 +5057,38 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB34_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB34_4 +; GFX12-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: 
.LBB34_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB34_2 +; GFX12-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -1954,42 +5100,109 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB35_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB35_3 +; GCN1-NEXT: s_branch .LBB35_4 +; GCN1-NEXT: .LBB35_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB35_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB35_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; 
GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB35_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB35_3 +; GCN2-NEXT: s_branch .LBB35_4 +; GCN2-NEXT: .LBB35_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB35_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB35_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -1999,13 +5212,36 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB35_2 +; GFX12-NEXT: ; 
%bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB35_3 +; GFX12-NEXT: s_branch .LBB35_4 +; GFX12-NEXT: .LBB35_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB35_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB35_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2020,37 +5256,131 @@ entry: define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umax_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB36_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB36_4 +; GCN1-NEXT: .LBB36_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB36_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB36_2 +; GCN1-NEXT: .LBB36_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; 
GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB36_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB36_4 +; GCN2-NEXT: .LBB36_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB36_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB36_2 +; GCN2-NEXT: .LBB36_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB36_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB36_4 +; GFX12-NEXT: .LBB36_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB36_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB36_2 +; GFX12-NEXT: .LBB36_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst @@ -2060,15 +5390,49 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 
s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB37_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execz .LBB37_3 +; GCN1-NEXT: s_branch .LBB37_4 +; GCN1-NEXT: .LBB37_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB37_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB37_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) @@ -2077,15 +5441,48 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_umax_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB37_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execz .LBB37_3 +; GCN2-NEXT: s_branch .LBB37_4 +; GCN2-NEXT: .LBB37_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB37_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt 
vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB37_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -2097,12 +5494,33 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB37_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB37_3 +; GFX12-NEXT: s_branch .LBB37_4 +; GFX12-NEXT: .LBB37_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB37_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB37_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2115,34 +5533,105 @@ entry: define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB38_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB38_4 +; GCN1-NEXT: .LBB38_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB38_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, 
s7 +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB38_2 +; GCN1-NEXT: .LBB38_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB38_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB38_4 +; GCN2-NEXT: .LBB38_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB38_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB38_2 +; GCN2-NEXT: .LBB38_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umax_i64_addr64: @@ -2150,15 +5639,39 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; 
GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB38_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB38_4 +; GFX12-NEXT: .LBB38_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB38_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB38_2 +; GFX12-NEXT: .LBB38_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -2169,38 +5682,105 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umax_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB39_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB39_3 +; GCN1-NEXT: s_branch .LBB39_4 +; GCN1-NEXT: .LBB39_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB39_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: 
v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB39_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB39_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB39_3 +; GCN2-NEXT: s_branch .LBB39_4 +; GCN2-NEXT: .LBB39_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB39_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB39_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -2208,15 +5788,37 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-LABEL: atomic_umax_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB39_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB39_3 +; GFX12-NEXT: s_branch .LBB39_4 +; GFX12-NEXT: .LBB39_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB39_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB39_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2230,41 +5832,137 @@ entry: define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_min_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB40_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB40_4 +; GCN1-NEXT: .LBB40_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB40_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_cbranch_execnz .LBB40_2 +; GCN1-NEXT: .LBB40_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; 
GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB40_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB40_4 +; GCN2-NEXT: .LBB40_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB40_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: s_cbranch_execnz .LBB40_2 +; GCN2-NEXT: .LBB40_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB40_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB40_4 +; GFX12-NEXT: .LBB40_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB40_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, 
s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB40_2 +; GFX12-NEXT: .LBB40_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -2275,38 +5973,105 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_min_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB41_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execz .LBB41_3 +; GCN1-NEXT: s_branch .LBB41_4 +; GCN1-NEXT: .LBB41_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB41_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB41_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; 
GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB41_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execz .LBB41_3 +; GCN2-NEXT: s_branch .LBB41_4 +; GCN2-NEXT: .LBB41_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB41_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB41_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -2316,12 +6081,35 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB41_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB41_3 +; GFX12-NEXT: s_branch .LBB41_4 +; GFX12-NEXT: .LBB41_2: +; GFX12-NEXT: ; 
implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB41_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB41_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2335,38 +6123,109 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB42_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB42_4 +; GCN1-NEXT: .LBB42_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB42_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB42_2 +; GCN1-NEXT: .LBB42_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 
s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB42_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB42_4 +; GCN2-NEXT: .LBB42_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB42_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB42_2 +; GCN2-NEXT: .LBB42_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_addr64_offset: @@ -2376,13 +6235,38 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB42_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB42_4 +; GFX12-NEXT: .LBB42_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB42_2 +; GFX12-NEXT: .LBB42_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 
+; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -2394,42 +6278,109 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB43_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB43_3 +; GCN1-NEXT: s_branch .LBB43_4 +; GCN1-NEXT: .LBB43_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB43_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB43_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; 
GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB43_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB43_3 +; GCN2-NEXT: s_branch .LBB43_4 +; GCN2-NEXT: .LBB43_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB43_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB43_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -2439,13 +6390,36 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB43_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB43_3 +; GFX12-NEXT: s_branch .LBB43_4 +; GFX12-NEXT: .LBB43_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB43_3: ; 
%atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB43_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2460,37 +6434,131 @@ entry: define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_min_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB44_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB44_4 +; GCN1-NEXT: .LBB44_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB44_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB44_2 +; GCN1-NEXT: .LBB44_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: 
s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB44_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB44_4 +; GCN2-NEXT: .LBB44_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB44_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB44_2 +; GCN2-NEXT: .LBB44_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB44_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB44_4 +; GFX12-NEXT: .LBB44_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB44_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB44_2 +; GFX12-NEXT: .LBB44_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst @@ -2500,15 +6568,49 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_min_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] 
+; GCN1-NEXT: s_cbranch_vccz .LBB45_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execz .LBB45_3 +; GCN1-NEXT: s_branch .LBB45_4 +; GCN1-NEXT: .LBB45_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB45_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB45_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) @@ -2517,15 +6619,48 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_min_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB45_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execz .LBB45_3 +; GCN2-NEXT: s_branch .LBB45_4 +; GCN2-NEXT: .LBB45_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB45_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB45_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) @@ -2537,12 +6672,33 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr 
%out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB45_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB45_3 +; GFX12-NEXT: s_branch .LBB45_4 +; GFX12-NEXT: .LBB45_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB45_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB45_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2555,34 +6711,105 @@ entry: define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB46_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB46_4 +; GCN1-NEXT: .LBB46_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB46_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB46_2 +; GCN1-NEXT: .LBB46_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen 
+; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB46_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB46_4 +; GCN2-NEXT: .LBB46_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB46_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB46_2 +; GCN2-NEXT: .LBB46_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_min_i64_addr64: @@ -2590,15 +6817,39 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; 
GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB46_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB46_4 +; GFX12-NEXT: .LBB46_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB46_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB46_2 +; GFX12-NEXT: .LBB46_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -2609,38 +6860,105 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_min_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB47_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB47_3 +; GCN1-NEXT: s_branch .LBB47_4 +; GCN1-NEXT: .LBB47_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB47_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: 
buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB47_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB47_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB47_3 +; GCN2-NEXT: s_branch .LBB47_4 +; GCN2-NEXT: .LBB47_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB47_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB47_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -2648,15 +6966,37 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-LABEL: atomic_min_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN 
scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB47_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB47_3 +; GFX12-NEXT: s_branch .LBB47_4 +; GFX12-NEXT: .LBB47_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB47_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB47_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2670,41 +7010,137 @@ entry: define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB48_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB48_4 +; GCN1-NEXT: .LBB48_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB48_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: s_cbranch_execnz .LBB48_2 +; GCN1-NEXT: .LBB48_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 
offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB48_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB48_4 +; GCN2-NEXT: .LBB48_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB48_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: s_cbranch_execnz .LBB48_2 +; GCN2-NEXT: .LBB48_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB48_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB48_4 +; GFX12-NEXT: .LBB48_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB48_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB48_2 +; GFX12-NEXT: .LBB48_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 
+; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -2715,38 +7151,105 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB49_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execz .LBB49_3 +; GCN1-NEXT: s_branch .LBB49_4 +; GCN1-NEXT: .LBB49_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB49_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB49_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; 
GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB49_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execz .LBB49_3 +; GCN2-NEXT: s_branch .LBB49_4 +; GCN2-NEXT: .LBB49_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB49_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB49_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -2756,12 +7259,35 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB49_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB49_3 +; GFX12-NEXT: s_branch .LBB49_4 +; GFX12-NEXT: .LBB49_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB49_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB49_4: ; 
%atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2775,38 +7301,109 @@ entry: define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_umin_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB50_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB50_4 +; GCN1-NEXT: .LBB50_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB50_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB50_2 +; GCN1-NEXT: .LBB50_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; 
GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB50_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB50_4 +; GCN2-NEXT: .LBB50_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB50_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cbranch_execnz .LBB50_2 +; GCN2-NEXT: .LBB50_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_umin_i64_addr64_offset: @@ -2816,13 +7413,38 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB50_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB50_4 +; GFX12-NEXT: .LBB50_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB50_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execnz .LBB50_2 +; GFX12-NEXT: .LBB50_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -2834,42 +7456,109 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 
%index) { ; GCN1-LABEL: atomic_umin_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB51_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB51_3 +; GCN1-NEXT: s_branch .LBB51_4 +; GCN1-NEXT: .LBB51_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB51_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB51_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 
v2, s0 -; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB51_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB51_3 +; GCN2-NEXT: s_branch .LBB51_4 +; GCN2-NEXT: .LBB51_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB51_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB51_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -2879,13 +7568,36 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB51_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_cbranch_execz .LBB51_3 +; GFX12-NEXT: s_branch .LBB51_4 +; GFX12-NEXT: .LBB51_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB51_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB51_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -2900,37 +7612,131 @@ entry: define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_umin_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB52_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB52_4 +; GCN1-NEXT: .LBB52_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB52_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cbranch_execnz .LBB52_2 +; GCN1-NEXT: .LBB52_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB52_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB52_4 +; GCN2-NEXT: .LBB52_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB52_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], 
v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cbranch_execnz .LBB52_2
+; GCN2-NEXT: .LBB52_4: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GCN2-NEXT: s_cselect_b32 s0, s4, -1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: s_add_i32 s0, s0, 4
+; GCN2-NEXT: v_mov_b32_e32 v3, s0
+; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_cmp_eq_u32 s1, s5
+; GFX12-NEXT: s_cselect_b32 s4, -1, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_mov_b32 s4, -1
+; GFX12-NEXT: s_cbranch_vccnz .LBB52_3
+; GFX12-NEXT: ; %bb.1: ; %Flow
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_vccz .LBB52_4
+; GFX12-NEXT: .LBB52_2: ; %atomicrmw.phi
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: .LBB52_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_cbranch_execnz .LBB52_2
+; GFX12-NEXT: .LBB52_4: ; %atomicrmw.private
+; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12-NEXT: s_cselect_b32 s0, s0, -1
+; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
+; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst
@@ -2940,15 +7746,49 @@ entry:
define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-LABEL: atomic_umin_i64_ret:
; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN1-NEXT: s_mov_b32 s14, -1
+; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
-; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
+; GCN1-NEXT: s_add_u32 s12, s12, s9
+; GCN1-NEXT: s_addc_u32 s13, s13, 0
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_cmp_eq_u32 s5, s8
+; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT: s_cbranch_vccz .LBB53_2
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_cbranch_execz .LBB53_3
+; GCN1-NEXT: s_branch .LBB53_4
+; GCN1-NEXT: .LBB53_2:
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: .LBB53_3: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
+; GCN1-NEXT: v_mov_b32_e32 v5, s0
+; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT: s_cselect_b32 s2, s4, -1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
+; GCN1-NEXT: s_add_i32 s2, s2, 4
+; GCN1-NEXT: v_mov_b32_e32 v3, s2
+; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT: v_mov_b32_e32 v4, s1
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
+; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
+; GCN1-NEXT: .LBB53_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
@@ -2957,15 +7797,48 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
;
; GCN2-LABEL: atomic_umin_i64_ret:
; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GCN2-NEXT: s_mov_b32 s90, -1
+; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
-; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_mov_b32 s91, 0xe80000
+; GCN2-NEXT: s_add_u32 s88, s88, s9
+; GCN2-NEXT: s_addc_u32 s89, s89, 0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cmp_eq_u32 s5, s8
+; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT: s_cbranch_vccz .LBB53_2
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cbranch_execz .LBB53_3
+; GCN2-NEXT: s_branch .LBB53_4
+; GCN2-NEXT: .LBB53_2:
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: .LBB53_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GCN2-NEXT: s_cselect_b32 s2, s4, -1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: s_add_i32 s2, s2, 4
+; GCN2-NEXT: v_mov_b32_e32 v3, s2
+; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT: v_mov_b32_e32 v5, s0
+; GCN2-NEXT: v_mov_b32_e32 v4, s1
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[0:1], v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
+; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
+; GCN2-NEXT: .LBB53_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
@@ -2977,12 +7850,33 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_cmp_eq_u32 s5, s3
+; GFX12-NEXT: s_cselect_b32 s2, -1, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX12-NEXT: s_cbranch_vccz .LBB53_2
+; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_cbranch_execz .LBB53_3
+; GFX12-NEXT: s_branch .LBB53_4
+; GFX12-NEXT: .LBB53_2:
+; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX12-NEXT: .LBB53_3: ; %atomicrmw.private
+; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX12-NEXT: s_cselect_b32 s2, s4, -1
+; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX12-NEXT: v_cndmask_b32_e32 v3, s1, v1, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v2, s0, v0, vcc_lo
+; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
+; GFX12-NEXT: .LBB53_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
@@ -2995,34 +7889,105 @@ entry:
define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i64_addr64:
; GCN1: ; %bb.0: ; %entry
-; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
+; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
+; GCN1-NEXT: s_add_u32 s12, s12, s9
+; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
-; GCN1-NEXT: v_mov_b32_e32 v1, s7
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1]
+; GCN1-NEXT: s_cmp_eq_u32 s1, s2
+; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT: s_mov_b64 s[2:3], -1
+; GCN1-NEXT: s_cbranch_vccnz .LBB54_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT: s_cbranch_vccz .LBB54_4
+; GCN1-NEXT: .LBB54_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_endpgm
+; GCN1-NEXT: .LBB54_3: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s0
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_cbranch_execnz .LBB54_2
+; GCN1-NEXT: .LBB54_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
+; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT: s_cselect_b32 s0, s0, -1
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: s_add_i32 s0, s0, 4
+; GCN1-NEXT: v_mov_b32_e32 v3, s0
+; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i64_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
+; GCN2-NEXT: s_mov_b32 s91, 0xe80000
+; GCN2-NEXT: s_add_u32 s88, s88, s9
+; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1]
+; GCN2-NEXT: s_cmp_eq_u32 s1, s2
+; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT: s_mov_b64 s[2:3], -1
+; GCN2-NEXT: s_cbranch_vccnz .LBB54_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT: s_cbranch_vccz .LBB54_4
+; GCN2-NEXT: .LBB54_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_endpgm
+; GCN2-NEXT: .LBB54_3: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cbranch_execnz .LBB54_2
+; GCN2-NEXT: .LBB54_4: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GCN2-NEXT: s_cselect_b32 s0, s0, -1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: s_add_i32 s0, s0, 4
+; GCN2-NEXT: v_mov_b32_e32 v3, s0
+; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_addr64:
@@ -3030,15 +7995,39 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_cmp_eq_u32 s1, s7
+; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] scope:SCOPE_SE
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_mov_b32 s4, -1
+; GFX12-NEXT: s_cbranch_vccnz .LBB54_3
+; GFX12-NEXT: ; %bb.1: ; %Flow
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_vccz .LBB54_4
+; GFX12-NEXT: .LBB54_2: ; %atomicrmw.phi
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: .LBB54_3: ; %atomicrmw.global
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_cbranch_execnz .LBB54_2
+; GFX12-NEXT: .LBB54_4: ; %atomicrmw.private
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12-NEXT: s_cselect_b32 s0, s0, -1
+; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
+; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
@@ -3049,38 +8038,105 @@ entry:
define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_umin_i64_ret_addr64:
; GCN1: ; %bb.0: ; %entry
-; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
+; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN1-NEXT: s_mov_b32 s14, -1
+; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
+; GCN1-NEXT: s_add_u32 s12, s12, s9
+; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
+; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
+; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN1-NEXT: s_add_u32 s0, s0, s4
-; GCN1-NEXT: s_addc_u32 s1, s1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN1-NEXT: s_add_u32 s0, s4, s0
+; GCN1-NEXT: s_addc_u32 s1, s5, s1
+; GCN1-NEXT: s_cmp_eq_u32 s1, s2
+; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT: s_cbranch_vccz .LBB55_2
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s0
+; GCN1-NEXT: v_mov_b32_e32 v2, s8
+; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v3, s9
+; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
-; GCN1-NEXT: v_mov_b32_e32 v3, s3
+; GCN1-NEXT: s_cbranch_execz .LBB55_3
+; GCN1-NEXT: s_branch .LBB55_4
+; GCN1-NEXT: .LBB55_2:
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: .LBB55_3: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GCN1-NEXT: v_mov_b32_e32 v5, s8
+; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT: s_cselect_b32 s0, s0, -1
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: s_add_i32 s0, s0, 4
+; GCN1-NEXT: v_mov_b32_e32 v3, s0
+; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT: v_mov_b32_e32 v4, s9
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
+; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen
+; GCN1-NEXT: .LBB55_4: ; %atomicrmw.end
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_umin_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GCN2-NEXT: s_mov_b32 s90, -1
+; GCN2-NEXT: s_mov_b32 s91, 0xe80000
+; GCN2-NEXT: s_add_u32 s88, s88, s9
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
+; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
+; GCN2-NEXT: s_cmp_eq_u32 s1, s2
+; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT: s_cbranch_vccz .LBB55_2
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v2, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v3, s9
+; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: s_cbranch_execz .LBB55_3
+; GCN2-NEXT: s_branch .LBB55_4
+; GCN2-NEXT: .LBB55_2:
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: .LBB55_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GCN2-NEXT: s_cselect_b32 s0, s0, -1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: s_add_i32 s0, s0, 4
+; GCN2-NEXT: v_mov_b32_e32 v3, s0
+; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[8:9], v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
+; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
+; GCN2-NEXT: .LBB55_4: ; %atomicrmw.end
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
@@ -3088,15 +8144,37 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GFX12-LABEL: atomic_umin_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
+; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_cmp_eq_u32 s1, s9
+; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
+; GFX12-NEXT: s_cbranch_vccz .LBB55_2
+; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_cbranch_execz .LBB55_3
+; GFX12-NEXT: s_branch .LBB55_4
+; GFX12-NEXT: .LBB55_2:
+; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX12-NEXT: .LBB55_3: ; %atomicrmw.private
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12-NEXT: s_cselect_b32 s0, s0, -1
+; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cndmask_b32_e32 v3, s5, v1, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v2, s4, v0, vcc_lo
+; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
+; GFX12-NEXT: .LBB55_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
@@ -3110,43 +8188,134 @@ entry:
define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_or_i64_offset:
; GCN1: ; %bb.0: ; %entry
-; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
-; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: s_add_u32 s0, s0, 32
-; GCN1-NEXT: s_addc_u32 s1, s1, 0
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
-; GCN1-NEXT: v_mov_b32_e32 v0, s2
-; GCN1-NEXT: v_mov_b32_e32 v1, s3
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN1-NEXT: s_mov_b32 s14, -1
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d
+; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
+; GCN1-NEXT: s_add_u32 s12, s12, s9
+; GCN1-NEXT: s_addc_u32 s13, s13, 0
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_add_u32 s0, s4, 32
+; GCN1-NEXT: s_addc_u32 s1, s5, 0
+; GCN1-NEXT: s_cmp_eq_u32 s1, s2
+; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT: s_mov_b64 s[2:3], -1
+; GCN1-NEXT: s_cbranch_vccnz .LBB56_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT: s_cbranch_vccz .LBB56_4
+; GCN1-NEXT: .LBB56_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_endpgm
+; GCN1-NEXT: .LBB56_3: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s0
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execnz .LBB56_2
+; GCN1-NEXT: .LBB56_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT: s_cselect_b32 s0, s0, -1
+; GCN1-NEXT: v_mov_b32_e32 v0, s0
+; GCN1-NEXT: s_add_i32 s0, s0, 4
+; GCN1-NEXT: v_mov_b32_e32 v1, s0
+; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen
+; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v2, s6, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, s7, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GCN2-NEXT: s_mov_b32 s90, -1
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4
+; GCN2-NEXT: s_mov_b32 s91, 0xe80000
+; GCN2-NEXT: s_add_u32 s88, s88, s9
+; GCN2-NEXT: s_addc_u32 s89, s89, 0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
+; GCN2-NEXT: s_cmp_eq_u32 s1, s2
+; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT: s_mov_b64 s[2:3], -1
+; GCN2-NEXT: s_cbranch_vccnz .LBB56_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT: s_cbranch_vccz .LBB56_4
+; GCN2-NEXT: .LBB56_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_endpgm
+; GCN2-NEXT: .LBB56_3: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execnz .LBB56_2
+; GCN2-NEXT: .LBB56_4: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GCN2-NEXT: s_cselect_b32 s0, s0, -1
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: s_add_i32 s0, s0, 4
+; GCN2-NEXT: v_mov_b32_e32 v1, s0
+; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen
+; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v2, s6, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, s7, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_cmp_eq_u32 s1, s5
+; GFX12-NEXT: s_cselect_b32 s4, -1, 0
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_mov_b32 s4, -1
+; GFX12-NEXT: s_cbranch_vccnz .LBB56_3
+; GFX12-NEXT: ; %bb.1: ; %Flow
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_vccz .LBB56_4
+; GFX12-NEXT: .LBB56_2: ; %atomicrmw.phi
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: .LBB56_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_cbranch_execnz .LBB56_2
+; GFX12-NEXT: .LBB56_4: ; %atomicrmw.private
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12-NEXT: s_cselect_b32 s0, s0, -1
+; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX12-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %gep = getelementptr i64, ptr %out, i64 4
@@ -3157,39 +8326,102 @@ entry:
define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GCN1-LABEL: atomic_or_i64_ret_offset:
; GCN1: ; %bb.0: ; %entry
-; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
-; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
+; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN1-NEXT: s_mov_b32 s14, -1
+; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
+; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; GCN1-NEXT: s_add_u32 s12, s12, s9
+; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: s_add_u32 s0, s0, 32
-; GCN1-NEXT: s_addc_u32 s1, s1, 0
+; GCN1-NEXT: s_add_u32 s2, s4, 32
+; GCN1-NEXT: s_addc_u32 s3, s5, 0
+; GCN1-NEXT: s_cmp_eq_u32 s3, s8
+; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_vccz .LBB57_2
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s2
; GCN1-NEXT: v_mov_b32_e32 v3, s1
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v1, s3
; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execz .LBB57_3
+; GCN1-NEXT: s_branch .LBB57_4
+; GCN1-NEXT: .LBB57_2:
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: .LBB57_3: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
+; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN1-NEXT: s_cselect_b32 s2, s2, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s2
-; GCN1-NEXT: v_mov_b32_e32 v3, s3
+; GCN1-NEXT: s_add_i32 s2, s2, 4
+; GCN1-NEXT: v_mov_b32_e32 v3, s2
+; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v4, s0, v0
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v5, s1, v1
+; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
+; GCN1-NEXT: .LBB57_4: ; %atomicrmw.end
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GCN2-NEXT: s_mov_b32 s90, -1
+; GCN2-NEXT: s_mov_b32 s91, 0xe80000
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
+; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GCN2-NEXT: s_add_u32 s88, s88, s9
+; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s2, s4, 32
+; GCN2-NEXT: s_addc_u32 s3, s5, 0
+; GCN2-NEXT: s_cmp_eq_u32 s3, s8
+; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_vccz .LBB57_2
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execz .LBB57_3
+; GCN2-NEXT: s_branch .LBB57_4
+; GCN2-NEXT: .LBB57_2:
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: .LBB57_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GCN2-NEXT: s_cselect_b32 s2, s2, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: s_add_i32 s2, s2, 4
+; GCN2-NEXT: v_mov_b32_e32 v3, s2
+; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v4, s0, v0
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v5, s1, v1
+; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
+; GCN2-NEXT: .LBB57_4: ; %atomicrmw.end
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -3198,12 +8430,34 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_cmp_eq_u32 s3, s9
+; GFX12-NEXT: s_cselect_b32 s4, -1, 0
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_vccz .LBB57_2
+; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_cbranch_execz .LBB57_3
+; GFX12-NEXT: s_branch .LBB57_4
+; GFX12-NEXT: .LBB57_2:
+; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX12-NEXT: .LBB57_3: ; %atomicrmw.private
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX12-NEXT: s_cselect_b32 s2, s2, -1
+; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_or_b32_e32 v3, s1, v1
+; GFX12-NEXT: v_or_b32_e32 v2, s0, v0
+; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
+; GFX12-NEXT: .LBB57_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
@@ -3217,40 +8471,107 @@ entry:
define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i64_addr64_offset:
; GCN1: ; %bb.0: ; %entry
-; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
+; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
+; GCN1-NEXT: s_add_u32 s12, s12, s9
+; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
-; GCN1-NEXT: v_mov_b32_e32 v1, s7
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GCN1-NEXT: s_cmp_eq_u32 s1, s2
+; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT: s_mov_b64 s[2:3], -1
+; GCN1-NEXT: s_cbranch_vccnz .LBB58_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT: s_cbranch_vccz .LBB58_4
+; GCN1-NEXT: .LBB58_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_endpgm
+; GCN1-NEXT: .LBB58_3: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s0
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execnz .LBB58_2
+; GCN1-NEXT: .LBB58_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT: s_cselect_b32 s0, s0, -1
+; GCN1-NEXT: v_mov_b32_e32 v0, s0
+; GCN1-NEXT: s_add_i32 s0, s0, 4
+; GCN1-NEXT: v_mov_b32_e32 v1, s0
+; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen
+; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v2, s6, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, s7, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
+; GCN2-NEXT: s_mov_b32 s91, 0xe80000
+; GCN2-NEXT: s_add_u32 s88, s88, s9
+; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GCN2-NEXT: s_cmp_eq_u32 s1, s2
+; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT: s_mov_b64 s[2:3], -1
+; GCN2-NEXT: s_cbranch_vccnz .LBB58_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT: s_cbranch_vccz .LBB58_4
+; GCN2-NEXT: .LBB58_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_endpgm
+; GCN2-NEXT: .LBB58_3: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execnz .LBB58_2
+; GCN2-NEXT: .LBB58_4: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GCN2-NEXT: s_cselect_b32 s0, s0, -1
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: s_add_i32 s0, s0, 4
+; GCN2-NEXT: v_mov_b32_e32 v1, s0
+; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen
+; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v2, s6, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, s7, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_addr64_offset:
@@ -3260,13 +8581,37 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
+; GFX12-NEXT: s_cmp_eq_u32 s1, s5
+; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_mov_b32 s4, -1
+; GFX12-NEXT: s_cbranch_vccnz .LBB58_3
+; GFX12-NEXT: ; %bb.1: ; %Flow
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_vccz .LBB58_4
+; GFX12-NEXT: .LBB58_2: ; %atomicrmw.phi
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: .LBB58_3: ; %atomicrmw.global
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_cbranch_execnz .LBB58_2
+; GFX12-NEXT: .LBB58_4: ; %atomicrmw.private
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12-NEXT: s_cselect_b32 s0, s0, -1
+; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX12-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
@@ -3278,43 +8623,106 @@ entry:
define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i64_ret_addr64_offset:
; GCN1: ; %bb.0: ; %entry
-; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
+; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN1-NEXT: s_mov_b32 s14, -1
+; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
+; GCN1-NEXT: s_add_u32 s12, s12, s9
+; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
+; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
+; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN1-NEXT: s_add_u32 s0, s0, s4
-; GCN1-NEXT: s_addc_u32 s1, s1, s5
+; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN1-NEXT: s_add_u32 s0, s4, s0
+; GCN1-NEXT: s_addc_u32 s1, s5, s1
; GCN1-NEXT: s_add_u32 s0, s0, 32
; GCN1-NEXT: s_addc_u32 s1, s1, 0
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN1-NEXT: s_cmp_eq_u32 s1, s2
+; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT: s_cbranch_vccz .LBB59_2
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s0
+; GCN1-NEXT: v_mov_b32_e32 v2, s8
+; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v3, s9
+; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
-; GCN1-NEXT: v_mov_b32_e32 v3, s3
+; GCN1-NEXT: s_cbranch_execz .LBB59_3
+; GCN1-NEXT: s_branch .LBB59_4
+; GCN1-NEXT: .LBB59_2:
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: .LBB59_3: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT: s_cselect_b32 s0, s0, -1
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: s_add_i32 s0, s0, 4
+; GCN1-NEXT: v_mov_b32_e32 v3, s0
+; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v4, s8, v0
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v5, s9, v1
+; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
+; GCN1-NEXT: .LBB59_4: ; %atomicrmw.end
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GCN2-NEXT: s_mov_b32 s90, -1
+; GCN2-NEXT: s_mov_b32 s91, 0xe80000
+; GCN2-NEXT: s_add_u32 s88, s88, s9
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
+; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN2-NEXT: s_cmp_eq_u32 s1, s2
+; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT: s_cbranch_vccz .LBB59_2
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v2, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v3, s9
+; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: s_cbranch_execz .LBB59_3
+; GCN2-NEXT: s_branch .LBB59_4
+; GCN2-NEXT: .LBB59_2:
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: .LBB59_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GCN2-NEXT: s_cselect_b32 s0, s0, -1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: s_add_i32 s0, s0, 4
+; GCN2-NEXT: v_mov_b32_e32 v3, s0
+; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v4, s8, v0
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v5, s9, v1
+; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
+; GCN2-NEXT: .LBB59_4: ; %atomicrmw.end
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -3323,13 +8731,35 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32
+; GFX12-NEXT: s_cmp_eq_u32 s1, s7
+; GFX12-NEXT: s_cselect_b32 s6, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6
+; GFX12-NEXT: s_cbranch_vccz .LBB59_2
+; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_cbranch_execz .LBB59_3
+; GFX12-NEXT: s_branch .LBB59_4
+; GFX12-NEXT: .LBB59_2:
+; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX12-NEXT: .LBB59_3: ; %atomicrmw.private
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12-NEXT: s_cselect_b32 s0, s0, -1
+; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_or_b32_e32 v3, s5, v1
+; GFX12-NEXT: v_or_b32_e32 v2, s4, v0
+; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0
+; GFX12-NEXT: .LBB59_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
@@ -3344,39 +8774,128 @@ entry:
define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
; GCN1-LABEL: atomic_or_i64:
; GCN1: ; %bb.0: ; %entry
-; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
-; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s0
-; GCN1-NEXT: v_mov_b32_e32 v1, s1
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
-; GCN1-NEXT: v_mov_b32_e32 v3, s3
+; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d
+; GCN1-NEXT: s_mov_b32 s14, -1
+; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
+; GCN1-NEXT: s_add_u32 s12, s12, s9
+; GCN1-NEXT: s_addc_u32 s13, s13, 0
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_cmp_eq_u32 s5, s0
+; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; GCN1-NEXT: s_mov_b64 s[0:1], -1
+; GCN1-NEXT: s_cbranch_vccnz .LBB60_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; GCN1-NEXT: s_cbranch_vccz .LBB60_4
+; GCN1-NEXT: .LBB60_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_endpgm
+; GCN1-NEXT: .LBB60_3: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execnz .LBB60_2
+; GCN1-NEXT: .LBB60_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
+; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GCN1-NEXT: s_cselect_b32 s0, s4, -1
+; GCN1-NEXT: v_mov_b32_e32 v0, s0
+; GCN1-NEXT: s_add_i32 s0, s0, 4
+; GCN1-NEXT: v_mov_b32_e32 v1, s0
+; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen
+; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v2, s6, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, s7, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4
+; GCN2-NEXT: s_mov_b32 s90, -1
+; GCN2-NEXT: s_mov_b32 s91, 0xe80000
+; GCN2-NEXT: s_add_u32 s88, s88, s9
+; GCN2-NEXT: s_addc_u32 s89, s89, 0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cmp_eq_u32 s5, s0
+; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; GCN2-NEXT: s_mov_b64 s[0:1], -1
+; GCN2-NEXT: s_cbranch_vccnz .LBB60_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1]
+; GCN2-NEXT: s_cbranch_vccz .LBB60_4
+; GCN2-NEXT: .LBB60_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_endpgm
+; GCN2-NEXT: .LBB60_3: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execnz .LBB60_2
+; GCN2-NEXT: .LBB60_4: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GCN2-NEXT: s_cselect_b32 s0, s4, -1
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: s_add_i32 s0, s0, 4
+; GCN2-NEXT: v_mov_b32_e32 v1, s0
+; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen
+; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v2, s6, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, s7, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_cmp_eq_u32 s1, s5
+; GFX12-NEXT: s_cselect_b32 s4, -1, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_mov_b32 s4, -1
+; GFX12-NEXT: s_cbranch_vccnz .LBB60_3
+; GFX12-NEXT: ; %bb.1: ; %Flow
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_vccz .LBB60_4
+; GFX12-NEXT: .LBB60_2: ; %atomicrmw.phi
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: .LBB60_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_cbranch_execnz .LBB60_2
+; GFX12-NEXT: .LBB60_4: ; %atomicrmw.private
+; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12-NEXT: s_cselect_b32 s0, s0, -1
+; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX12-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst
@@ -3386,16 +8905,48 @@ entry:
define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN1-LABEL: atomic_or_i64_ret:
; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN1-NEXT: s_mov_b32 s14, -1
+; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
-; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
+; GCN1-NEXT: s_add_u32 s12, s12, s9
+; GCN1-NEXT: s_addc_u32 s13, s13, 0
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_cmp_eq_u32 s5, s8
+; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT: s_cbranch_vccz .LBB61_2
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: v_mov_b32_e32 v1, s5
; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execz .LBB61_3
+; GCN1-NEXT: s_branch .LBB61_4
+; GCN1-NEXT: .LBB61_2:
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: .LBB61_3: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
+; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT: s_cselect_b32 s2, s4, -1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
+; GCN1-NEXT: s_add_i32 s2, s2, 4
+; GCN1-NEXT: v_mov_b32_e32 v3, s2
+; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v4, s0, v0
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v5, s1, v1
+; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
+; GCN1-NEXT: .LBB61_4: ; %atomicrmw.end
; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -3403,16 +8954,47 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
;
; GCN2-LABEL: atomic_or_i64_ret:
; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GCN2-NEXT: s_mov_b32 s90, -1
+; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
-; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_mov_b32 s91, 0xe80000
+; GCN2-NEXT: s_add_u32 s88, s88, s9
+; GCN2-NEXT: s_addc_u32 s89, s89, 0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cmp_eq_u32 s5, s8
+; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT: s_cbranch_vccz .LBB61_2
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execz .LBB61_3
+; GCN2-NEXT: s_branch .LBB61_4
+; GCN2-NEXT: .LBB61_2:
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: .LBB61_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GCN2-NEXT: s_cselect_b32 s2, s4, -1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: s_add_i32 s2, s2, 4
+; GCN2-NEXT: v_mov_b32_e32 v3, s2
+; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v4, s0, v0
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v5, s1, v1
+; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
+; GCN2-NEXT: .LBB61_4: ; %atomicrmw.end
; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -3423,12 +9005,32 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_cmp_eq_u32 s5, s3
+; GFX12-NEXT: s_cselect_b32 s2, -1, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX12-NEXT: s_cbranch_vccz .LBB61_2
+; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_cbranch_execz .LBB61_3
+; GFX12-NEXT: s_branch .LBB61_4
+; GFX12-NEXT: .LBB61_2:
+; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX12-NEXT: .LBB61_3: ; %atomicrmw.private
+; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX12-NEXT: s_cselect_b32 s2, s4, -1
+; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_or_b32_e32 v3, s1, v1
+; GFX12-NEXT: v_or_b32_e32 v2, s0, v0
+; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2
+; GFX12-NEXT: .LBB61_4: ; %atomicrmw.end
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
@@ -3441,36 +9043,103 @@ entry:
define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i64_addr64:
; GCN1: ; %bb.0: ; %entry
-; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN1-NEXT: s_mov_b32 s14, -1
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f
+; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
+; GCN1-NEXT: s_add_u32 s12, s12, s9
+; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN1-NEXT: s_add_u32 s0, s4, s0
; GCN1-NEXT: s_addc_u32 s1, s5, s1
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
-; GCN1-NEXT: v_mov_b32_e32 v1, s7
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GCN1-NEXT: s_cmp_eq_u32 s1, s2
+; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT: s_mov_b64 s[2:3], -1
+; GCN1-NEXT: s_cbranch_vccnz .LBB62_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT: s_cbranch_vccz .LBB62_4
+; GCN1-NEXT: .LBB62_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_endpgm
+; GCN1-NEXT: .LBB62_3: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s0
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execnz .LBB62_2
+; GCN1-NEXT: .LBB62_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT: s_cselect_b32 s0, s0, -1
+; GCN1-NEXT: v_mov_b32_e32 v0, s0
+; GCN1-NEXT: s_add_i32 s0, s0, 4
+; GCN1-NEXT: v_mov_b32_e32 v1, s0
+; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen
+; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v2, s6, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, s7, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i64_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GCN2-NEXT: s_mov_b32 s90, -1
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc
+; GCN2-NEXT: s_mov_b32 s91, 0xe80000
+; GCN2-NEXT: s_add_u32 s88, s88, s9
+; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GCN2-NEXT: s_cmp_eq_u32 s1, s2
+; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT: s_mov_b64 s[2:3], -1
+; GCN2-NEXT: s_cbranch_vccnz .LBB62_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT: s_cbranch_vccz .LBB62_4
+; GCN2-NEXT: .LBB62_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_endpgm
+; GCN2-NEXT: .LBB62_3: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execnz .LBB62_2
+; GCN2-NEXT: .LBB62_4: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GCN2-NEXT: s_cselect_b32 s0, s0, -1
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: s_add_i32 s0, s0, 4
+; GCN2-NEXT: v_mov_b32_e32 v1, s0
+; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen
+; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v2, s6, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, s7, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_addr64:
@@ -3478,15 +9147,38 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_cmp_eq_u32 s1, s7
+; GFX12-NEXT: s_cselect_b32 s4, -1, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] scope:SCOPE_DEV
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_mov_b32 s4, -1
+; GFX12-NEXT: s_cbranch_vccnz .LBB62_3
+; GFX12-NEXT: ; %bb.1: ; %Flow
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_vccz .LBB62_4
+; GFX12-NEXT: .LBB62_2: ; %atomicrmw.phi
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: .LBB62_3: ; %atomicrmw.global
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_cbranch_execnz .LBB62_2
+; GFX12-NEXT: .LBB62_4: ; %atomicrmw.private
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12-NEXT: s_cselect_b32 s0, s0, -1
+; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX12-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
@@ -3497,54 +9189,138 @@ entry:
define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GCN1-LABEL: atomic_or_i64_ret_addr64:
; GCN1: ; %bb.0: ; %entry
-; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
+; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN1-NEXT: s_mov_b32 s14, -1
+; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
+; GCN1-NEXT: s_add_u32 s12, s12, s9
+; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
+; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41
+; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN1-NEXT: s_add_u32 s0, s0, s4
-; GCN1-NEXT: s_addc_u32 s1, s1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s1
-; GCN1-NEXT: v_mov_b32_e32 v2, s0
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN1-NEXT: s_add_u32 s0, s4, s0
+; GCN1-NEXT: s_addc_u32 s1, s5, s1
+; GCN1-NEXT: s_cmp_eq_u32 s1, s2
+; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT: s_cbranch_vccz .LBB63_2
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s0
+; GCN1-NEXT: v_mov_b32_e32 v2, s8
+; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v3, s9
+; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_mov_b32_e32 v2, s2
-; GCN1-NEXT: v_mov_b32_e32 v3, s3
+; GCN1-NEXT: s_cbranch_execz .LBB63_3
+; GCN1-NEXT: s_branch .LBB63_4
+; GCN1-NEXT: .LBB63_2:
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: .LBB63_3: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT: s_cselect_b32 s0, s0, -1
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: s_add_i32 s0, s0, 4
+; GCN1-NEXT: v_mov_b32_e32 v3, s0
+; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v4, s8, v0
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v5, s9, v1
+; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen
+; GCN1-NEXT: .LBB63_4: ; %atomicrmw.end
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_or_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GCN2-NEXT: s_mov_b32 s90, -1
+; GCN2-NEXT: s_mov_b32 s91, 0xe80000
+; GCN2-NEXT: s_add_u32 s88, s88, s9
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104
+; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
+; GCN2-NEXT: s_cmp_eq_u32 s1, s2
+; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT: s_cbranch_vccz .LBB63_2
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v2, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v3, s9
+; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: s_cbranch_execz .LBB63_3
+; GCN2-NEXT: s_branch .LBB63_4
+; GCN2-NEXT: .LBB63_2:
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: .LBB63_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GCN2-NEXT: s_cselect_b32 s0, s0, -1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: s_add_i32 s0, s0, 4
+; GCN2-NEXT: v_mov_b32_e32 v3, s0
+; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v4, s8, v0
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v5, s9, v1
+; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen
+; GCN2-NEXT: .LBB63_4: ; %atomicrmw.end
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64:
; GFX12: ; %bb.0: ;
%entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB63_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB63_3 +; GFX12-NEXT: s_branch .LBB63_4 +; GFX12-NEXT: .LBB63_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB63_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_or_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_or_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB63_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -3558,43 +9334,123 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB64_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB64_4 +; GCN1-NEXT: .LBB64_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB64_2 +; GCN1-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: 
v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB64_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB64_4 +; GCN2-NEXT: .LBB64_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB64_2 +; GCN2-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB64_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB64_4 +; GFX12-NEXT: .LBB64_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB64_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB64_2 +; GFX12-NEXT: 
.LBB64_4: ; %atomicrmw.private +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -3605,43 +9461,123 @@ entry: define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GCN1-LABEL: atomic_xchg_f64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB65_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB65_4 +; GCN1-NEXT: .LBB65_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB65_2 +; GCN1-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_f64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB65_3 +; GCN2-NEXT: ; 
%bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB65_4 +; GCN2-NEXT: .LBB65_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB65_2 +; GCN2-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_f64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB65_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB65_4 +; GFX12-NEXT: .LBB65_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB65_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB65_2 +; GFX12-NEXT: .LBB65_4: ; %atomicrmw.private +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr double, ptr %out, i64 4 @@ -3652,43 +9588,123 @@ entry: define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GCN1-LABEL: atomic_xchg_pointer_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; 
GCN1-NEXT: s_cbranch_vccnz .LBB66_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB66_4 +; GCN1-NEXT: .LBB66_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB66_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB66_2 +; GCN1-NEXT: .LBB66_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_pointer_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB66_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB66_4 +; GCN2-NEXT: .LBB66_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB66_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB66_2 +; GCN2-NEXT: .LBB66_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_pointer_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, 
-1 +; GFX12-NEXT: s_cbranch_vccnz .LBB66_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB66_4 +; GFX12-NEXT: .LBB66_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB66_2 +; GFX12-NEXT: .LBB66_4: ; %atomicrmw.private +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr ptr, ptr %out, i32 4 @@ -3699,39 +9715,100 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB67_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB67_3 +; GCN1-NEXT: s_branch .LBB67_4 +; GCN1-NEXT: .LBB67_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB67_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s1 +; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB67_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: s_waitcnt vmcnt(2) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; 
GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB67_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB67_3 +; GCN2-NEXT: s_branch .LBB67_4 +; GCN2-NEXT: .LBB67_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB67_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s1 +; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB67_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: s_waitcnt vmcnt(2) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -3740,13 +9817,34 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB67_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB67_3 +; GFX12-NEXT: s_branch .LBB67_4 +; GFX12-NEXT: .LBB67_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB67_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: 
s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB67_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -3759,40 +9857,99 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB68_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB68_4 +; GCN1-NEXT: .LBB68_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB68_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB68_2 +; GCN1-NEXT: .LBB68_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: 
flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB68_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB68_4 +; GCN2-NEXT: .LBB68_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB68_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB68_2 +; GCN2-NEXT: .LBB68_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_addr64_offset: @@ -3802,13 +9959,34 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB68_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB68_4 +; GFX12-NEXT: .LBB68_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB68_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB68_2 +; GFX12-NEXT: .LBB68_4: ; %atomicrmw.private +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -3820,43 +9998,104 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; 
GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB69_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB69_3 +; GCN1-NEXT: s_branch .LBB69_4 +; GCN1-NEXT: .LBB69_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB69_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s9 +; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB69_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: s_waitcnt vmcnt(2) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB69_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: 
buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB69_3 +; GCN2-NEXT: s_branch .LBB69_4 +; GCN2-NEXT: .LBB69_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB69_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s8 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s9 +; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB69_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: s_waitcnt vmcnt(2) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -3865,14 +10104,35 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB69_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB69_3 +; GFX12-NEXT: s_branch .LBB69_4 +; GFX12-NEXT: .LBB69_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB69_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB69_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -3886,39 +10146,117 @@ entry: define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 
s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB70_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB70_4 +; GCN1-NEXT: .LBB70_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB70_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB70_2 +; GCN1-NEXT: .LBB70_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB70_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB70_4 +; GCN2-NEXT: .LBB70_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB70_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB70_2 +; GCN2-NEXT: .LBB70_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB70_3 +; GFX12-NEXT: ; 
%bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB70_4 +; GFX12-NEXT: .LBB70_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB70_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB70_2 +; GFX12-NEXT: .LBB70_4: ; %atomicrmw.private +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst @@ -3928,35 +10266,96 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB71_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB71_3 +; GCN1-NEXT: s_branch .LBB71_4 +; GCN1-NEXT: .LBB71_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB71_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s1 +; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB71_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: s_waitcnt vmcnt(2) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 
vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB71_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB71_3 +; GCN2-NEXT: s_branch .LBB71_4 +; GCN2-NEXT: .LBB71_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB71_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s1 +; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB71_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: s_waitcnt vmcnt(2) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -3965,13 +10364,32 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB71_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB71_3 +; GFX12-NEXT: s_branch .LBB71_4 +; GFX12-NEXT: .LBB71_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB71_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB71_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -3983,36 +10401,95 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], 
s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB72_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB72_4 +; GCN1-NEXT: .LBB72_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB72_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB72_2 +; GCN1-NEXT: .LBB72_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB72_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB72_4 +; GCN2-NEXT: .LBB72_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB72_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB72_2 +; GCN2-NEXT: .LBB72_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_addr64: @@ -4020,15 +10497,35 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr 
%out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB72_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB72_4 +; GFX12-NEXT: .LBB72_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB72_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB72_2 +; GFX12-NEXT: .LBB72_4: ; %atomicrmw.private +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4039,55 +10536,136 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xchg_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB73_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB73_3 +; GCN1-NEXT: s_branch .LBB73_4 +; GCN1-NEXT: .LBB73_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB73_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], 
s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s9 +; GCN1-NEXT: buffer_store_dword v2, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB73_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: s_waitcnt vmcnt(2) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB73_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB73_3 +; GCN2-NEXT: s_branch .LBB73_4 +; GCN2-NEXT: .LBB73_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB73_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s8 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s9 +; GCN2-NEXT: buffer_store_dword v2, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB73_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: s_waitcnt vmcnt(2) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xchg_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; 
GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB73_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB73_3 +; GFX12-NEXT: s_branch .LBB73_4 +; GFX12-NEXT: .LBB73_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB73_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB73_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: @@ -4100,43 +10678,134 @@ entry: define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB74_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB74_4 +; GCN1-NEXT: .LBB74_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB74_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB74_2 +; GCN1-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; 
GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB74_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB74_4 +; GCN2-NEXT: .LBB74_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB74_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB74_2 +; GCN2-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB74_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB74_4 +; GFX12-NEXT: .LBB74_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB74_2 +; GFX12-NEXT: .LBB74_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 
0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -4147,39 +10816,102 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB75_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB75_3 +; GCN1-NEXT: s_branch .LBB75_4 +; GCN1-NEXT: .LBB75_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB75_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v4, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, s1, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB75_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 
+; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB75_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB75_3 +; GCN2-NEXT: s_branch .LBB75_4 +; GCN2-NEXT: .LBB75_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB75_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v4, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, s1, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB75_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -4188,12 +10920,34 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB75_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB75_3 +; GFX12-NEXT: s_branch .LBB75_4 +; GFX12-NEXT: .LBB75_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB75_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v3, s1, v1 +; GFX12-NEXT: v_xor_b32_e32 v2, s0, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB75_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -4207,40 +10961,107 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { ; 
GCN1-LABEL: atomic_xor_i64_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB76_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB76_4 +; GCN1-NEXT: .LBB76_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB76_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB76_2 +; GCN1-NEXT: .LBB76_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB76_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB76_4 +; GCN2-NEXT: .LBB76_2: ; 
%atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB76_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB76_2 +; GCN2-NEXT: .LBB76_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_addr64_offset: @@ -4250,13 +11071,37 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB76_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB76_4 +; GFX12-NEXT: .LBB76_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB76_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB76_2 +; GFX12-NEXT: .LBB76_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4268,43 +11113,106 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_ret_addr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; 
GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB77_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB77_3 +; GCN1-NEXT: s_branch .LBB77_4 +; GCN1-NEXT: .LBB77_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB77_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v4, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, s9, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB77_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_ret_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB77_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 
v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB77_3 +; GCN2-NEXT: s_branch .LBB77_4 +; GCN2-NEXT: .LBB77_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB77_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v4, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, s9, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB77_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -4313,13 +11221,35 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB77_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB77_3 +; GFX12-NEXT: s_branch .LBB77_4 +; GFX12-NEXT: .LBB77_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB77_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_xor_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB77_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -4334,39 +11264,128 @@ entry: define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_xor_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 
s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB78_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB78_4 +; GCN1-NEXT: .LBB78_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB78_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB78_2 +; GCN1-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB78_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB78_4 +; GCN2-NEXT: .LBB78_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB78_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB78_2 +; GCN2-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB78_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB78_4 +; GFX12-NEXT: .LBB78_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB78_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB78_2 +; GFX12-NEXT: .LBB78_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst @@ -4376,16 +11395,48 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB79_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB79_3 +; GCN1-NEXT: s_branch .LBB79_4 +; GCN1-NEXT: .LBB79_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB79_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v4, s0, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, s1, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB79_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -4393,16 +11444,47 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: 
atomic_xor_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB79_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB79_3 +; GCN2-NEXT: s_branch .LBB79_4 +; GCN2-NEXT: .LBB79_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB79_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v4, s0, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, s1, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB79_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -4413,12 +11495,32 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB79_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB79_3 +; GFX12-NEXT: s_branch .LBB79_4 +; GFX12-NEXT: .LBB79_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB79_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v3, s1, v1 +; GFX12-NEXT: v_xor_b32_e32 v2, s0, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB79_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -4431,36 +11533,103 @@ entry: define amdgpu_kernel void @atomic_xor_i64_addr64(ptr 
%out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB80_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB80_4 +; GCN1-NEXT: .LBB80_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB80_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB80_2 +; GCN1-NEXT: .LBB80_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s0 +; GCN1-NEXT: buffer_load_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB80_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB80_4 +; GCN2-NEXT: .LBB80_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB80_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: 
v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB80_2 +; GCN2-NEXT: .LBB80_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s0 +; GCN2-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_addr64: @@ -4468,15 +11637,38 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB80_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB80_4 +; GFX12-NEXT: .LBB80_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB80_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB80_2 +; GFX12-NEXT: .LBB80_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX12-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -4487,54 +11679,138 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_xor_i64_ret_addr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], 
s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB81_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB81_3 +; GCN1-NEXT: s_branch .LBB81_4 +; GCN1-NEXT: .LBB81_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB81_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v4, s8, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, s9, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB81_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_ret_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB81_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB81_3 +; GCN2-NEXT: s_branch .LBB81_4 +; GCN2-NEXT: .LBB81_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB81_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: 
s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v4, s8, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, s9, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB81_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_xor_i64_ret_addr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB81_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB81_3 +; GFX12-NEXT: s_branch .LBB81_4 +; GFX12-NEXT: .LBB81_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB81_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_xor_b32_e32 v3, s5, v1 +; GFX12-NEXT: v_xor_b32_e32 v2, s4, v0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB81_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -5908,43 +13184,143 @@ entry: define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; 
GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB107_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB107_4 +; GCN1-NEXT: .LBB107_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB107_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB107_2 +; GCN1-NEXT: .LBB107_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB107_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB107_4 +; GCN2-NEXT: .LBB107_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB107_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB107_2 +; GCN2-NEXT: .LBB107_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, 
vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB107_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB107_4 +; GFX12-NEXT: .LBB107_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB107_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB107_2 +; GFX12-NEXT: .LBB107_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -5955,39 +13331,108 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB108_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: 
s_cbranch_execz .LBB108_3 +; GCN1-NEXT: s_branch .LBB108_4 +; GCN1-NEXT: .LBB108_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB108_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB108_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB108_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB108_3 +; GCN2-NEXT: s_branch .LBB108_4 +; GCN2-NEXT: .LBB108_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB108_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB108_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; 
GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -5996,12 +13441,37 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s3, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB108_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB108_3 +; GFX12-NEXT: s_branch .LBB108_4 +; GFX12-NEXT: .LBB108_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB108_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX12-NEXT: s_cselect_b32 s2, s2, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB108_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -6015,40 +13485,113 @@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB109_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB109_4 +; GCN1-NEXT: .LBB109_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: 
.LBB109_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB109_2 +; GCN1-NEXT: .LBB109_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB109_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB109_4 +; GCN2-NEXT: .LBB109_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB109_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB109_2 +; GCN2-NEXT: .LBB109_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, 
s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_incr64_offset: @@ -6058,13 +13601,40 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB109_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB109_4 +; GFX12-NEXT: .LBB109_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB109_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB109_2 +; GFX12-NEXT: .LBB109_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -6076,43 +13646,112 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_ret_incr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB110_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 
v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB110_3 +; GCN1-NEXT: s_branch .LBB110_4 +; GCN1-NEXT: .LBB110_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB110_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB110_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_ret_incr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB110_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB110_3 +; GCN2-NEXT: s_branch .LBB110_4 +; GCN2-NEXT: .LBB110_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB110_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 
1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB110_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -6121,13 +13760,38 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB110_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB110_3 +; GFX12-NEXT: s_branch .LBB110_4 +; GFX12-NEXT: .LBB110_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB110_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB110_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -6142,39 +13806,137 @@ entry: define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_inc_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: 
s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB111_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB111_4 +; GCN1-NEXT: .LBB111_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB111_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB111_2 +; GCN1-NEXT: .LBB111_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB111_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB111_4 +; GCN2-NEXT: .LBB111_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB111_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB111_2 +; GCN2-NEXT: .LBB111_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, 
s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB111_4 +; GFX12-NEXT: .LBB111_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB111_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB111_2 +; GFX12-NEXT: .LBB111_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst @@ -6184,16 +13946,51 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB112_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB112_3 +; GCN1-NEXT: s_branch .LBB112_4 +; GCN1-NEXT: .LBB112_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB112_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; 
GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB112_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -6201,16 +13998,50 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_inc_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB112_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB112_3 +; GCN2-NEXT: s_branch .LBB112_4 +; GCN2-NEXT: .LBB112_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB112_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB112_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -6221,12 +14052,35 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_vccz .LBB112_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB112_3 +; GFX12-NEXT: s_branch .LBB112_4 +; 
GFX12-NEXT: .LBB112_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB112_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s2, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s2 +; GFX12-NEXT: .LBB112_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -6239,36 +14093,109 @@ entry: define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB113_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB113_4 +; GCN1-NEXT: .LBB113_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB113_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB113_2 +; GCN1-NEXT: .LBB113_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB113_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB113_4 +; GCN2-NEXT: .LBB113_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB113_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB113_2 +; GCN2-NEXT: .LBB113_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_incr64: @@ -6276,15 +14203,41 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB113_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB113_4 +; GFX12-NEXT: .LBB113_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB113_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV 
+; GFX12-NEXT: s_cbranch_execnz .LBB113_2 +; GFX12-NEXT: .LBB113_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -6295,54 +14248,147 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_inc_i64_ret_incr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB114_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB114_3 +; GCN1-NEXT: s_branch .LBB114_4 +; GCN1-NEXT: .LBB114_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB114_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: buffer_store_dword v4, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB114_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_ret_incr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; 
GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB114_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB114_3 +; GCN2-NEXT: s_branch .LBB114_4 +; GCN2-NEXT: .LBB114_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB114_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: buffer_store_dword v4, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB114_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_inc_i64_ret_incr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB114_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB114_3 +; GFX12-NEXT: s_branch .LBB114_4 +; GFX12-NEXT: .LBB114_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB114_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, 1 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 +; GFX12-NEXT: .LBB114_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -6356,43 +14402,154 @@ entry: define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s4, 32 +; GCN1-NEXT: s_addc_u32 s1, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB115_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB115_4 +; GCN1-NEXT: .LBB115_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB115_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB115_2 +; GCN1-NEXT: .LBB115_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 
offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s4, 32 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB115_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB115_4 +; GCN2-NEXT: .LBB115_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB115_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB115_2 +; GCN2-NEXT: .LBB115_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB115_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB115_4 +; GFX12-NEXT: .LBB115_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB115_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV 
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB115_2 +; GFX12-NEXT: .LBB115_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s4, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] +; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -6403,39 +14560,116 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: s_cmp_eq_u32 s3, s8 +; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_cbranch_vccz .LBB116_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB116_3 +; GCN1-NEXT: s_branch .LBB116_4 +; GCN1-NEXT: .LBB116_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB116_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN1-NEXT: s_cselect_b32 s2, s2, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1] +; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: 
buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB116_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: s_cmp_eq_u32 s3, s8 +; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_cbranch_vccz .LBB116_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB116_3 +; GCN2-NEXT: s_branch .LBB116_4 +; GCN2-NEXT: .LBB116_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB116_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GCN2-NEXT: s_cselect_b32 s2, s2, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1] +; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB116_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -6443,13 +14677,43 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], 32 +; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB116_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB116_3 +; GFX12-NEXT: s_branch .LBB116_4 +; GFX12-NEXT: .LBB116_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB116_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s4, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] +; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s3, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s2, s0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s4 +; GFX12-NEXT: .LBB116_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -6463,40 +14727,119 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB117_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB117_4 +; GCN1-NEXT: .LBB117_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB117_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB117_2 +; GCN1-NEXT: .LBB117_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 
+; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB117_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB117_4 +; GCN2-NEXT: .LBB117_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB117_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB117_2 +; GCN2-NEXT: .LBB117_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_decr64_offset: @@ -6506,13 +14849,45 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, 
i64 % ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB117_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB117_4 +; GFX12-NEXT: .LBB117_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB117_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB117_2 +; GFX12-NEXT: .LBB117_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s4, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] +; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -6524,43 +14899,120 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_ret_decr64_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB118_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global 
+; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB118_3 +; GCN1-NEXT: s_branch .LBB118_4 +; GCN1-NEXT: .LBB118_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB118_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1] +; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB118_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_ret_decr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB118_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB118_3 +; GCN2-NEXT: s_branch .LBB118_4 +; GCN2-NEXT: .LBB118_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB118_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: 
v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1] +; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB118_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -6569,13 +15021,43 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 32 +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB118_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB118_3 +; GFX12-NEXT: s_branch .LBB118_4 +; GFX12-NEXT: .LBB118_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB118_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s6, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] +; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 +; GFX12-NEXT: .LBB118_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -6590,39 +15072,148 @@ entry: define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GCN1-LABEL: atomic_dec_i64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 -; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s0 -; GCN1-NEXT: 
v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s0, s[2:3], 0x3d +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s0 +; GCN1-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_mov_b64 s[0:1], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB119_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN1-NEXT: s_cbranch_vccz .LBB119_4 +; GCN1-NEXT: .LBB119_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB119_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB119_2 +; GCN1-NEXT: .LBB119_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN1-NEXT: s_cselect_b32 s0, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s0, s[2:3], 0xf4 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s0 +; GCN2-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_mov_b64 s[0:1], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB119_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GCN2-NEXT: s_cbranch_vccz .LBB119_4 +; GCN2-NEXT: .LBB119_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB119_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB119_2 +; GCN2-NEXT: 
.LBB119_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s0, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s1, s5 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB119_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB119_4 +; GFX12-NEXT: .LBB119_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB119_3: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB119_2 +; GFX12-NEXT: .LBB119_4: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s4, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] +; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst @@ -6632,16 +15223,55 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_ret: ; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd -; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s8 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB120_2 +; GCN1-NEXT: ; 
%bb.1: ; %atomicrmw.global ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB120_3 +; GCN1-NEXT: s_branch .LBB120_4 +; GCN1-NEXT: .LBB120_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB120_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s1 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s2, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: s_add_i32 s2, s2, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1] +; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB120_4: ; %atomicrmw.end ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -6649,16 +15279,54 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; ; GCN2-LABEL: atomic_dec_i64_ret: ; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 -; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s8 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB120_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB120_3 +; GCN2-NEXT: s_branch .LBB120_4 +; GCN2-NEXT: .LBB120_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB120_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s2, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: s_add_i32 s2, s2, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s1 +; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1] +; GCN2-NEXT: v_addc_u32_e64 v7, 
s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB120_4: ; %atomicrmw.end ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -6668,13 +15336,42 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x34 +; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s5, s1 +; GFX12-NEXT: s_cselect_b32 s0, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_vccz .LBB120_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB120_3 +; GFX12-NEXT: s_branch .LBB120_4 +; GFX12-NEXT: .LBB120_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB120_3: ; %atomicrmw.private +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s4, s4, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] +; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s3, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s2, s0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s4 +; GFX12-NEXT: .LBB120_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm @@ -6687,36 +15384,115 @@ entry: define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x3f +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN1-NEXT: s_add_u32 s0, s4, s0 ; GCN1-NEXT: s_addc_u32 s1, s5, s1 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_mov_b64 s[2:3], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB121_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB121_4 +; GCN1-NEXT: .LBB121_2: ; %atomicrmw.phi +; GCN1-NEXT: s_endpgm +; GCN1-NEXT: .LBB121_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB121_2 +; GCN1-NEXT: .LBB121_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0xfc +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; GCN2-NEXT: s_add_u32 s0, s4, s0 ; GCN2-NEXT: s_addc_u32 s1, s5, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_mov_b64 s[2:3], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB121_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB121_4 +; GCN2-NEXT: .LBB121_2: ; %atomicrmw.phi +; GCN2-NEXT: s_endpgm +; GCN2-NEXT: .LBB121_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB121_2 +; GCN2-NEXT: .LBB121_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 
v5, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1] +; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_decr64: @@ -6724,15 +15500,46 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[6:7], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_cmp_eq_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cbranch_vccnz .LBB121_3 +; GFX12-NEXT: ; %bb.1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccz .LBB121_4 +; GFX12-NEXT: .LBB121_2: ; %atomicrmw.phi +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB121_3: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execnz .LBB121_2 +; GFX12-NEXT: .LBB121_4: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s4, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] +; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, s1, -1, v1, s1 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -6743,54 +15550,160 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %in, i64 %index) { ; GCN1-LABEL: atomic_dec_i64_ret_decr64: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN1-NEXT: s_add_u32 s12, s12, s9 +; GCN1-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GCN1-NEXT: s_load_dword s2, s[2:3], 0x41 +; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN1-NEXT: s_add_u32 s0, s0, s4 -; GCN1-NEXT: s_addc_u32 s1, s1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_cmp_eq_u32 s1, s2 +; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: s_cbranch_vccz .LBB122_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s8 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s9 +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_cbranch_execz .LBB122_3 +; GCN1-NEXT: s_branch .LBB122_4 +; GCN1-NEXT: .LBB122_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB122_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s8 +; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s0 +; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v4, s9 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1] +; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen +; GCN1-NEXT: .LBB122_4: ; %atomicrmw.end +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_ret_decr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_mov_b32 s91, 0xe80000 +; GCN2-NEXT: s_add_u32 s88, s88, s9 +; GCN2-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GCN2-NEXT: s_load_dword s2, s[2:3], 0x104 +; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_cmp_eq_u32 s1, s2 +; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cbranch_vccz .LBB122_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s8 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s9 +; 
GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_cbranch_execz .LBB122_3 +; GCN2-NEXT: s_branch .LBB122_4 +; GCN2-NEXT: .LBB122_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB122_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s0 +; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v5, s8 +; GCN2-NEXT: v_mov_b32_e32 v4, s9 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[0:1] +; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen +; GCN2-NEXT: .LBB122_4: ; %atomicrmw.end +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_ret_decr64: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_cmp_eq_u32 s1, s9 +; GFX12-NEXT: s_cselect_b32 s6, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccz .LBB122_2 +; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_cbranch_execz .LBB122_3 +; GFX12-NEXT: s_branch .LBB122_4 +; GFX12-NEXT: .LBB122_2: +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX12-NEXT: .LBB122_3: ; %atomicrmw.private +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX12-NEXT: s_cselect_b32 s6, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] +; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s1, -1, v1, s1 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 +; 
GFX12-NEXT: .LBB122_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 3fd624b592cd4..d7bd4b1e4918e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -11,25 +11,100 @@ define void @flat_atomic_xchg_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_xchg_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB0_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB0_4 +; GCN1-NEXT: .LBB0_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB0_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB0_2 +; GCN1-NEXT: .LBB0_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB0_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB0_4 +; GCN2-NEXT: .LBB0_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB0_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB0_2 +; GCN2-NEXT: .LBB0_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB0_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB0_4 +; GCN3-NEXT: .LBB0_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB0_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB0_2 +; GCN3-NEXT: .LBB0_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst ret void @@ -39,29 +114,106 @@ define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB1_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB1_4 +; GCN1-NEXT: .LBB1_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB1_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB1_2 +; GCN1-NEXT: .LBB1_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB1_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB1_4 +; GCN2-NEXT: .LBB1_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB1_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB1_2 +; GCN2-NEXT: .LBB1_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB1_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB1_4 +; GCN3-NEXT: .LBB1_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB1_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB1_2 +; GCN3-NEXT: .LBB1_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst @@ -72,25 +224,116 @@ define i64 @flat_atomic_xchg_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_xchg_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB2_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB2_4 +; GCN1-NEXT: .LBB2_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB2_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB2_2 +; GCN1-NEXT: .LBB2_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; 
GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB2_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB2_4 +; GCN2-NEXT: .LBB2_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB2_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB2_2 +; GCN2-NEXT: .LBB2_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v5, v1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB2_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB2_4 +; GCN3-NEXT: .LBB2_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB2_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB2_2 +; GCN3-NEXT: .LBB2_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_nop 0 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] 
%result = atomicrmw xchg ptr %ptr, i64 %in seq_cst ret i64 %result @@ -100,29 +343,116 @@ define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB3_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB3_4 +; GCN1-NEXT: .LBB3_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB3_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB3_2 +; GCN1-NEXT: .LBB3_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB3_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB3_4 +; GCN2-NEXT: .LBB3_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB3_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB3_2 +; GCN2-NEXT: .LBB3_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword 
v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB3_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB3_4 +; GCN3-NEXT: .LBB3_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB3_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB3_2 +; GCN3-NEXT: .LBB3_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_nop 0 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst @@ -133,37 +463,112 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-LABEL: flat_atomic_xchg_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB4_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB4_4 +; GCN1-NEXT: .LBB4_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB4_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB4_2 +; GCN1-NEXT: .LBB4_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; 
GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB4_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB4_4 +; GCN2-NEXT: .LBB4_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB4_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB4_2 +; GCN2-NEXT: .LBB4_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB4_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB4_4 +; GCN3-NEXT: .LBB4_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB4_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB4_2 +; GCN3-NEXT: .LBB4_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v1, s34 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst ret void @@ -173,41 +578,118 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: 
flat_atomic_xchg_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB5_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB5_4 +; GCN1-NEXT: .LBB5_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB5_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB5_2 +; GCN1-NEXT: .LBB5_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB5_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB5_4 +; GCN2-NEXT: .LBB5_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB5_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB5_2 +; GCN2-NEXT: .LBB5_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: 
flat_atomic_xchg_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB5_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB5_4 +; GCN3-NEXT: .LBB5_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB5_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB5_2 +; GCN3-NEXT: .LBB5_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v1, s34 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst @@ -218,37 +700,112 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-LABEL: flat_atomic_xchg_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB6_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB6_3 +; GCN1-NEXT: s_branch .LBB6_4 +; GCN1-NEXT: .LBB6_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB6_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s7 +; GCN1-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB6_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: 
flat_atomic_xchg_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB6_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB6_3 +; GCN2-NEXT: s_branch .LBB6_4 +; GCN2-NEXT: .LBB6_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB6_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s7 +; GCN2-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB6_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB6_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB6_3 +; GCN3-NEXT: s_branch .LBB6_4 +; GCN3-NEXT: .LBB6_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB6_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s6 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB6_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst ret i64 %result @@ -258,41 +815,118 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: 
s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB7_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB7_3 +; GCN1-NEXT: s_branch .LBB7_4 +; GCN1-NEXT: .LBB7_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB7_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s7 +; GCN1-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB7_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB7_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB7_3 +; GCN2-NEXT: s_branch .LBB7_4 +; GCN2-NEXT: .LBB7_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB7_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s7 +; GCN2-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB7_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB7_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB7_3 +; GCN3-NEXT: s_branch .LBB7_4 +; GCN3-NEXT: .LBB7_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB7_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s6 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB7_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst @@ -303,29 +937,106 @@ define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB8_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB8_4 +; GCN1-NEXT: .LBB8_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB8_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB8_2 +; GCN1-NEXT: .LBB8_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB8_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB8_4 +; GCN2-NEXT: .LBB8_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB8_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB8_2 +; GCN2-NEXT: .LBB8_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB8_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB8_4 +; GCN3-NEXT: .LBB8_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB8_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB8_2 +; GCN3-NEXT: .LBB8_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -336,29 +1047,116 @@ define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; 
GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB9_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB9_4 +; GCN1-NEXT: .LBB9_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB9_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB9_2 +; GCN1-NEXT: .LBB9_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB9_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB9_4 +; GCN2-NEXT: .LBB9_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB9_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB9_2 +; GCN2-NEXT: .LBB9_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: 
s_cbranch_execnz .LBB9_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB9_4 +; GCN3-NEXT: .LBB9_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB9_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB9_2 +; GCN3-NEXT: .LBB9_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_nop 0 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -373,25 +1171,100 @@ define void @flat_atomic_xchg_f64_noret(ptr %ptr, double %in) { ; GCN1-LABEL: flat_atomic_xchg_f64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB10_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB10_4 +; GCN1-NEXT: .LBB10_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB10_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB10_2 +; GCN1-NEXT: .LBB10_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB10_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB10_4 +; GCN2-NEXT: .LBB10_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB10_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB10_2 +; GCN2-NEXT: .LBB10_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB10_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB10_4 +; GCN3-NEXT: .LBB10_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB10_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB10_2 +; GCN3-NEXT: .LBB10_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst ret void @@ -401,29 +1274,106 @@ define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) { ; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB11_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB11_4 +; GCN1-NEXT: .LBB11_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB11_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB11_2 +; GCN1-NEXT: .LBB11_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB11_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB11_4 +; GCN2-NEXT: .LBB11_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB11_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB11_2 +; GCN2-NEXT: .LBB11_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB11_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB11_4 +; GCN3-NEXT: .LBB11_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB11_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB11_2 +; GCN3-NEXT: .LBB11_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst @@ -434,25 +1384,116 @@ define double @flat_atomic_xchg_f64_ret(ptr %ptr, double %in) { ; GCN1-LABEL: flat_atomic_xchg_f64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 
+; GCN1-NEXT: s_cbranch_execnz .LBB12_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB12_4 +; GCN1-NEXT: .LBB12_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB12_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB12_2 +; GCN1-NEXT: .LBB12_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB12_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB12_4 +; GCN2-NEXT: .LBB12_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB12_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB12_2 +; GCN2-NEXT: .LBB12_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v5, v1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB12_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB12_4 +; GCN3-NEXT: .LBB12_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: 
s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB12_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB12_2 +; GCN3-NEXT: .LBB12_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_nop 0 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr %ptr, double %in seq_cst ret double %result @@ -462,29 +1503,116 @@ define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) { ; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB13_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB13_4 +; GCN1-NEXT: .LBB13_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB13_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB13_2 +; GCN1-NEXT: .LBB13_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB13_3 +; 
GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB13_4 +; GCN2-NEXT: .LBB13_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB13_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB13_2 +; GCN2-NEXT: .LBB13_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB13_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB13_4 +; GCN3-NEXT: .LBB13_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB13_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB13_2 +; GCN3-NEXT: .LBB13_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_nop 0 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst @@ -495,37 +1623,112 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double ; GCN1-LABEL: flat_atomic_xchg_f64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; 
GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB14_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB14_4 +; GCN1-NEXT: .LBB14_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB14_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB14_2 +; GCN1-NEXT: .LBB14_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB14_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB14_4 +; GCN2-NEXT: .LBB14_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB14_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB14_2 +; GCN2-NEXT: .LBB14_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB14_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB14_4 +; GCN3-NEXT: 
.LBB14_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB14_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB14_2 +; GCN3-NEXT: .LBB14_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v1, s34 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst ret void @@ -535,41 +1738,118 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB15_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB15_4 +; GCN1-NEXT: .LBB15_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB15_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB15_2 +; GCN1-NEXT: .LBB15_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB15_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: 
s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB15_4 +; GCN2-NEXT: .LBB15_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB15_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB15_2 +; GCN2-NEXT: .LBB15_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB15_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB15_4 +; GCN3-NEXT: .LBB15_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB15_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB15_2 +; GCN3-NEXT: .LBB15_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v1, s34 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst @@ -580,37 +1860,112 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double ; GCN1-LABEL: flat_atomic_xchg_f64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB16_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; 
GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB16_3 +; GCN1-NEXT: s_branch .LBB16_4 +; GCN1-NEXT: .LBB16_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB16_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s7 +; GCN1-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB16_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB16_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB16_3 +; GCN2-NEXT: s_branch .LBB16_4 +; GCN2-NEXT: .LBB16_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB16_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s7 +; GCN2-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB16_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB16_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; 
GCN3-NEXT: s_cbranch_execz .LBB16_3 +; GCN3-NEXT: s_branch .LBB16_4 +; GCN3-NEXT: .LBB16_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB16_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s6 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB16_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr %ptr, double %in seq_cst ret double %result @@ -620,41 +1975,118 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB17_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB17_3 +; GCN1-NEXT: s_branch .LBB17_4 +; GCN1-NEXT: .LBB17_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB17_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s7 +; GCN1-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB17_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB17_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: 
v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB17_3 +; GCN2-NEXT: s_branch .LBB17_4 +; GCN2-NEXT: .LBB17_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB17_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s7 +; GCN2-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB17_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB17_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB17_3 +; GCN3-NEXT: s_branch .LBB17_4 +; GCN3-NEXT: .LBB17_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB17_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s6 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB17_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst @@ -665,29 +2097,106 @@ define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB18_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB18_4 +; GCN1-NEXT: .LBB18_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, 
exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB18_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB18_2 +; GCN1-NEXT: .LBB18_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB18_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB18_4 +; GCN2-NEXT: .LBB18_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB18_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB18_2 +; GCN2-NEXT: .LBB18_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB18_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB18_4 +; GCN3-NEXT: .LBB18_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB18_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB18_2 +; GCN3-NEXT: .LBB18_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: 
buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0 @@ -698,29 +2207,116 @@ define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB19_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB19_4 +; GCN1-NEXT: .LBB19_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB19_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB19_2 +; GCN1-NEXT: .LBB19_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB19_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB19_4 +; GCN2-NEXT: .LBB19_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB19_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GCN2-NEXT: s_cbranch_execz .LBB19_2 +; GCN2-NEXT: .LBB19_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB19_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB19_4 +; GCN3-NEXT: .LBB19_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB19_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB19_2 +; GCN3-NEXT: .LBB19_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_nop 0 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0 @@ -735,25 +2331,118 @@ define void @flat_atomic_add_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_add_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB20_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB20_4 +; GCN1-NEXT: .LBB20_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB20_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB20_2 +; GCN1-NEXT: .LBB20_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, 
vcc +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB20_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB20_4 +; GCN2-NEXT: .LBB20_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB20_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB20_2 +; GCN2-NEXT: .LBB20_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB20_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB20_4 +; GCN3-NEXT: .LBB20_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB20_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB20_2 +; GCN3-NEXT: .LBB20_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v3, vcc +; GCN3-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst ret void @@ -763,29 +2452,124 @@ define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_add_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB21_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB21_4 +; GCN1-NEXT: .LBB21_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB21_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB21_2 +; GCN1-NEXT: .LBB21_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB21_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB21_4 +; GCN2-NEXT: .LBB21_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB21_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB21_2 +; GCN2-NEXT: .LBB21_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, v1, 
v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB21_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB21_4 +; GCN3-NEXT: .LBB21_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB21_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB21_2 +; GCN3-NEXT: .LBB21_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst @@ -796,25 +2580,127 @@ define i64 @flat_atomic_add_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_add_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB22_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB22_4 +; GCN1-NEXT: .LBB22_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB22_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB22_2 +; GCN1-NEXT: .LBB22_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, 
s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB22_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB22_4 +; GCN2-NEXT: .LBB22_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB22_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB22_2 +; GCN2-NEXT: .LBB22_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v2, vcc, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v5, v1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB22_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB22_4 +; GCN3-NEXT: .LBB22_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB22_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB22_2 +; GCN3-NEXT: .LBB22_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, 
-1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw add ptr %ptr, i64 %in seq_cst ret i64 %result @@ -824,29 +2710,127 @@ define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_add_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB23_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB23_4 +; GCN1-NEXT: .LBB23_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB23_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB23_2 +; GCN1-NEXT: .LBB23_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB23_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB23_4 +; GCN2-NEXT: .LBB23_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: 
s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB23_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB23_2 +; GCN2-NEXT: .LBB23_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v2, vcc, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB23_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB23_4 +; GCN3-NEXT: .LBB23_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB23_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB23_2 +; GCN3-NEXT: .LBB23_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst @@ -857,37 +2841,127 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-LABEL: flat_atomic_add_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; 
GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB24_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB24_4 +; GCN1-NEXT: .LBB24_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB24_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB24_2 +; GCN1-NEXT: .LBB24_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB24_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB24_4 +; GCN2-NEXT: .LBB24_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB24_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB24_2 +; GCN2-NEXT: .LBB24_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; 
GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB24_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB24_4 +; GCN3-NEXT: .LBB24_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB24_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB24_2 +; GCN3-NEXT: .LBB24_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v1, vcc, s6, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst ret void @@ -897,41 +2971,133 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_add_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB25_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB25_4 +; GCN1-NEXT: .LBB25_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB25_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB25_2 +; GCN1-NEXT: .LBB25_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; 
GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB25_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB25_4 +; GCN2-NEXT: .LBB25_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB25_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB25_2 +; GCN2-NEXT: .LBB25_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB25_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB25_4 +; GCN3-NEXT: .LBB25_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB25_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB25_2 +; GCN3-NEXT: .LBB25_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen 
+; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v1, vcc, s6, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst @@ -942,37 +3108,121 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-LABEL: flat_atomic_add_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB26_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_setpc_b64 s[30:31] -; +; GCN1-NEXT: s_cbranch_execz .LBB26_3 +; GCN1-NEXT: s_branch .LBB26_4 +; GCN1-NEXT: .LBB26_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB26_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB26_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: s_setpc_b64 s[30:31] +; ; GCN2-LABEL: flat_atomic_add_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB26_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB26_3 +; GCN2-NEXT: s_branch 
.LBB26_4 +; GCN2-NEXT: .LBB26_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB26_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v5, vcc, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB26_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB26_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB26_3 +; GCN3-NEXT: s_branch .LBB26_4 +; GCN3-NEXT: .LBB26_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB26_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, s6, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB26_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw add ptr %ptr, i64 %in seq_cst ret i64 %result @@ -982,41 +3232,127 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-LABEL: flat_atomic_add_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB27_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: 
v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB27_3 +; GCN1-NEXT: s_branch .LBB27_4 +; GCN1-NEXT: .LBB27_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB27_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v5, vcc, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB27_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB27_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB27_3 +; GCN2-NEXT: s_branch .LBB27_4 +; GCN2-NEXT: .LBB27_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB27_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v5, vcc, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc +; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB27_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], 
-1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB27_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB27_3 +; GCN3-NEXT: s_branch .LBB27_4 +; GCN3-NEXT: .LBB27_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB27_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, s6, v0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB27_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst @@ -1027,29 +3363,124 @@ define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB28_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB28_4 +; GCN1-NEXT: .LBB28_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB28_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB28_2 +; GCN1-NEXT: .LBB28_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; 
GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB28_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB28_4 +; GCN2-NEXT: .LBB28_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB28_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB28_2 +; GCN2-NEXT: .LBB28_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB28_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB28_4 +; GCN3-NEXT: .LBB28_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB28_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB28_2 +; GCN3-NEXT: .LBB28_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1060,29 +3491,127 @@ define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB29_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB29_4 +; GCN1-NEXT: .LBB29_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB29_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB29_2 +; GCN1-NEXT: .LBB29_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB29_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB29_4 +; GCN2-NEXT: .LBB29_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB29_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB29_2 +; GCN2-NEXT: .LBB29_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v2, vcc, v0, v2 +; 
GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB29_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB29_4 +; GCN3-NEXT: .LBB29_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB29_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB29_2 +; GCN3-NEXT: .LBB29_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1097,25 +3626,118 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_sub_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB30_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB30_4 +; GCN1-NEXT: .LBB30_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB30_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB30_2 +; GCN1-NEXT: .LBB30_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; 
GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB30_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB30_4 +; GCN2-NEXT: .LBB30_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB30_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB30_2 +; GCN2-NEXT: .LBB30_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_sub_u32_e32 v1, vcc, v1, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB30_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB30_4 +; GCN3-NEXT: .LBB30_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB30_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB30_2 +; GCN3-NEXT: .LBB30_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_subb_co_u32_e32 v2, vcc, v4, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; 
GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst ret void @@ -1125,29 +3747,124 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB31_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB31_4 +; GCN1-NEXT: .LBB31_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB31_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB31_2 +; GCN1-NEXT: .LBB31_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB31_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB31_4 +; GCN2-NEXT: .LBB31_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB31_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB31_2 +; GCN2-NEXT: .LBB31_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_sub_u32_e32 v1, vcc, v1, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc +; GCN2-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB31_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB31_4 +; GCN3-NEXT: .LBB31_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB31_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB31_2 +; GCN3-NEXT: .LBB31_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_subb_co_u32_e32 v2, vcc, v4, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst @@ -1158,25 +3875,127 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_sub_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB32_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB32_4 +; GCN1-NEXT: .LBB32_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB32_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB32_2 +; GCN1-NEXT: .LBB32_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, 
s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB32_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB32_4 +; GCN2-NEXT: .LBB32_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB32_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB32_2 +; GCN2-NEXT: .LBB32_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v5, v1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB32_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB32_4 +; GCN3-NEXT: .LBB32_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB32_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB32_2 +; GCN3-NEXT: .LBB32_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword 
v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i64 %in seq_cst ret i64 %result @@ -1186,29 +4005,127 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_sub_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB33_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB33_4 +; GCN1-NEXT: .LBB33_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB33_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB33_2 +; GCN1-NEXT: .LBB33_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB33_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB33_4 +; GCN2-NEXT: .LBB33_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB33_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_sub_x2 
v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB33_2 +; GCN2-NEXT: .LBB33_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB33_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB33_4 +; GCN3-NEXT: .LBB33_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB33_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB33_2 +; GCN3-NEXT: .LBB33_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst @@ -1219,37 +4136,127 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-LABEL: flat_atomic_sub_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: 
s_cbranch_vccnz .LBB34_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB34_4 +; GCN1-NEXT: .LBB34_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB34_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB34_2 +; GCN1-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB34_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB34_4 +; GCN2-NEXT: .LBB34_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB34_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB34_2 +; GCN2-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; 
GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB34_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB34_4 +; GCN3-NEXT: .LBB34_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB34_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB34_2 +; GCN3-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_subrev_co_u32_e32 v1, vcc, s6, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst ret void @@ -1259,41 +4266,133 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB35_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB35_4 +; GCN1-NEXT: .LBB35_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB35_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB35_2 +; GCN1-NEXT: .LBB35_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN1-NEXT: 
buffer_store_dword v3, v2, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
 ; GCN2-NEXT: s_add_u32 s34, s4, 32
 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cmp_eq_u32 s35, s36
+; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT: s_mov_b64 s[36:37], -1
+; GCN2-NEXT: s_cbranch_vccnz .LBB35_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_vccz .LBB35_4
+; GCN2-NEXT: .LBB35_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB35_3: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execnz .LBB35_2
+; GCN2-NEXT: .LBB35_4: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
+; GCN2-NEXT: s_cselect_b32 s34, s34, -1
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: s_add_i32 s34, s34, 4
+; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GCN2-NEXT: v_mov_b32_e32 v2, s34
+; GCN2-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
+; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s7
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] offset:32
+; GCN3-NEXT: s_add_u32 s34, s4, 32
+; GCN3-NEXT: s_addc_u32 s35, s5, 0
+; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT: s_cmp_eq_u32 s35, s37
+; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT: s_mov_b64 s[36:37], -1
+; GCN3-NEXT: s_cbranch_vccnz .LBB35_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_vccz .LBB35_4
+; GCN3-NEXT: .LBB35_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB35_3: ; %atomicrmw.global
+; GCN3-NEXT: v_mov_b32_e32 v0, s34
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v3, s7
+; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_cbranch_execnz .LBB35_2
+; GCN3-NEXT: .LBB35_4: ; %atomicrmw.private
+; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
+; GCN3-NEXT: s_cselect_b32 s34, s34, -1
+; GCN3-NEXT: v_mov_b32_e32 v0, s34
+; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: v_mov_b32_e32 v3, s7
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_subrev_co_u32_e32 v1, vcc, s6, v1
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
+; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i64, ptr %out, i64 4
 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst
@@ -1304,37 +4403,121 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-LABEL: flat_atomic_sub_i64_ret_scalar:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s7
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_cmp_eq_u32 s5, s34
+; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_vccz .LBB36_2
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execz .LBB36_3
+; GCN1-NEXT: s_branch .LBB36_4
+; GCN1-NEXT: .LBB36_2:
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: .LBB36_3: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT: s_cselect_b32 s34, s4, -1
+; GCN1-NEXT: v_mov_b32_e32 v2, s34
+; GCN1-NEXT: s_add_i32 s34, s34, 4
+; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN1-NEXT: v_mov_b32_e32 v3, s34
+; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v0
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
+; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
+; GCN1-NEXT: .LBB36_4: ; %atomicrmw.end
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i64_ret_scalar:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cmp_eq_u32 s5, s34
+; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_vccz .LBB36_2
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execz .LBB36_3
+; GCN2-NEXT: s_branch .LBB36_4
+; GCN2-NEXT: .LBB36_2:
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: .LBB36_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GCN2-NEXT: s_cselect_b32 s34, s4, -1
+; GCN2-NEXT: v_mov_b32_e32 v2, s34
+; GCN2-NEXT: s_add_i32 s34, s34, 4
+; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN2-NEXT: v_mov_b32_e32 v3, s34
+; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v0
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
+; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
+; GCN2-NEXT: .LBB36_4: ; %atomicrmw.end
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i64_ret_scalar:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s7
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
+; GCN3-NEXT: s_cmp_eq_u32 s5, s35
+; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_vccz .LBB36_2
+; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v3, s7
+; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_cbranch_execz .LBB36_3
+; GCN3-NEXT: s_branch .LBB36_4
+; GCN3-NEXT: .LBB36_2:
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: .LBB36_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GCN3-NEXT: s_cselect_b32 s34, s4, -1
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GCN3-NEXT: v_mov_b32_e32 v3, s7
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GCN3-NEXT: .LBB36_4: ; %atomicrmw.end
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %result = atomicrmw sub ptr %ptr, i64 %in seq_cst
 ret i64 %result
@@ -1344,41 +4527,127 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
 ; GCN1-NEXT: s_add_u32 s34, s4, 32
 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s7
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_cmp_eq_u32 s35, s36
+; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_vccz .LBB37_2
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execz .LBB37_3
+; GCN1-NEXT: s_branch .LBB37_4
+; GCN1-NEXT: .LBB37_2:
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: .LBB37_3: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT: s_cselect_b32 s34, s34, -1
+; GCN1-NEXT: v_mov_b32_e32 v2, s34
+; GCN1-NEXT: s_add_i32 s34, s34, 4
+; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN1-NEXT: v_mov_b32_e32 v3, s34
+; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v0
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
+; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
+; GCN1-NEXT: .LBB37_4: ; %atomicrmw.end
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
 ; GCN2-NEXT: s_add_u32 s34, s4, 32
 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cmp_eq_u32 s35, s36
+; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_vccz .LBB37_2
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execz .LBB37_3
+; GCN2-NEXT: s_branch .LBB37_4
+; GCN2-NEXT: .LBB37_2:
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: .LBB37_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
+; GCN2-NEXT: s_cselect_b32 s34, s34, -1
+; GCN2-NEXT: v_mov_b32_e32 v2, s34
+; GCN2-NEXT: s_add_i32 s34, s34, 4
+; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN2-NEXT: v_mov_b32_e32 v3, s34
+; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v0
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
+; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
+; GCN2-NEXT: .LBB37_4: ; %atomicrmw.end
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s7
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GCN3-NEXT: s_add_u32 s34, s4, 32
+; GCN3-NEXT: s_addc_u32 s35, s5, 0
+; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT: s_cmp_eq_u32 s35, s37
+; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_vccz .LBB37_2
+; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN3-NEXT: v_mov_b32_e32 v0, s34
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v3, s7
+; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_cbranch_execz .LBB37_3
+; GCN3-NEXT: s_branch .LBB37_4
+; GCN3-NEXT: .LBB37_2:
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: .LBB37_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
+; GCN3-NEXT: s_cselect_b32 s34, s34, -1
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GCN3-NEXT: v_mov_b32_e32 v3, s7
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GCN3-NEXT: .LBB37_4: ; %atomicrmw.end
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i64, ptr %out, i64 4
 %result = atomicrmw sub ptr %gep, i64 %in seq_cst
@@ -1389,29 +4658,124 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
 ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB38_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB38_4
+; GCN1-NEXT: .LBB38_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB38_3: ; %atomicrmw.global
 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: ; implicit-def: $vgpr2
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB38_2
+; GCN1-NEXT: .LBB38_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc
+; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB38_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB38_4
+; GCN2-NEXT: .LBB38_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB38_3: ; %atomicrmw.global
+; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: ; implicit-def: $vgpr2
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB38_2
+; GCN2-NEXT: .LBB38_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_sub_u32_e32 v1, vcc, v1, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc
+; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB38_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB38_4
+; GCN3-NEXT: .LBB38_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB38_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: ; implicit-def: $vgpr2
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execz .LBB38_2
+; GCN3-NEXT: .LBB38_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_subb_co_u32_e32 v2, vcc, v4, v3, vcc
+; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i64, ptr %out, i64 4
 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -1422,29 +4786,127 @@ define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
 ; GCN1-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB39_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB39_4
+; GCN1-NEXT: .LBB39_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB39_3: ; %atomicrmw.global
+; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN1-NEXT: ; implicit-def: $vgpr2
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB39_2
+; GCN1-NEXT: .LBB39_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
+; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
+; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB39_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB39_4
+; GCN2-NEXT: .LBB39_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB39_3: ; %atomicrmw.global
+; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN2-NEXT: ; implicit-def: $vgpr2
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB39_2
+; GCN2-NEXT: .LBB39_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
+; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v0, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
+; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
+; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB39_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB39_4
+; GCN3-NEXT: .LBB39_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB39_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN3-NEXT: ; implicit-def: $vgpr2
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execz .LBB39_2
+; GCN3-NEXT: .LBB39_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i64, ptr %out, i64 4
 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -1459,25 +4921,118 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_and_i64_noret:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB40_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB40_4
+; GCN1-NEXT: .LBB40_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB40_3: ; %atomicrmw.global
 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB40_2
+; GCN1-NEXT: .LBB40_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_and_b32_e32 v2, v4, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v3, v5, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_noret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB40_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB40_4
+; GCN2-NEXT: .LBB40_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB40_3: ; %atomicrmw.global
 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB40_2
+; GCN2-NEXT: .LBB40_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_and_b32_e32 v2, v4, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v3, v5, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_noret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB40_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB40_4
+; GCN3-NEXT: .LBB40_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB40_3: ; %atomicrmw.global
 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execz .LBB40_2
+; GCN3-NEXT: .LBB40_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_and_b32_e32 v1, v1, v3
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v2, v4, v2
+; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst
 ret void
@@ -1487,29 +5042,124 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_and_i64_noret_offset:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB41_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB41_4
+; GCN1-NEXT: .LBB41_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB41_3: ; %atomicrmw.global
 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB41_2
+; GCN1-NEXT: .LBB41_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_and_b32_e32 v2, v4, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v3, v5, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_noret_offset:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB41_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB41_4
+; GCN2-NEXT: .LBB41_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB41_3: ; %atomicrmw.global
 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB41_2
+; GCN2-NEXT: .LBB41_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_and_b32_e32 v2, v4, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v3, v5, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_noret_offset:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB41_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB41_4
+; GCN3-NEXT: .LBB41_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB41_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execz .LBB41_2
+; GCN3-NEXT: .LBB41_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_and_b32_e32 v1, v1, v3
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v2, v4, v2
+; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i64, ptr %out, i64 4
 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst
@@ -1520,25 +5170,127 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_and_i64_ret:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN1-NEXT: v_mov_b32_e32 v5, v1
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB42_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB42_4
+; GCN1-NEXT: .LBB42_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB42_3: ; %atomicrmw.global
+; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB42_2
+; GCN1-NEXT: .LBB42_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
+; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_and_b32_e32 v2, v0, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v3, v1, v3
+; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_ret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN2-NEXT: v_mov_b32_e32 v5, v1
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB42_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB42_4
+; GCN2-NEXT: .LBB42_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB42_3: ; %atomicrmw.global
+; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB42_2
+; GCN2-NEXT: .LBB42_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
+; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_and_b32_e32 v2, v0, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v3, v1, v3
+; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_ret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v5, v1
+; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB42_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB42_4
+; GCN3-NEXT: .LBB42_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB42_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execz .LBB42_2
+; GCN3-NEXT: .LBB42_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_and_b32_e32 v3, v1, v3
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v2, v0, v2
+; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %result = atomicrmw and ptr %ptr, i64 %in seq_cst
 ret i64 %result
@@ -1548,29 +5300,127 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_and_i64_ret_offset:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB43_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB43_4
+; GCN1-NEXT: .LBB43_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB43_3: ; %atomicrmw.global
+; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB43_2
+; GCN1-NEXT: .LBB43_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
+; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_and_b32_e32 v2, v0, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v3, v1, v3
+; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_ret_offset:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB43_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB43_4
+; GCN2-NEXT: .LBB43_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB43_3: ; %atomicrmw.global
+; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB43_2
+; GCN2-NEXT: .LBB43_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
+; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_and_b32_e32 v2, v0, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v3, v1, v3
+; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_ret_offset:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
+; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB43_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB43_4
+; GCN3-NEXT: .LBB43_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB43_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execz .LBB43_2
+; GCN3-NEXT: .LBB43_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_and_b32_e32 v3, v1, v3
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v2, v0, v2
+; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i64, ptr %out, i64 4
 %result = atomicrmw and ptr %gep, i64 %in seq_cst
@@ -1581,37 +5431,124 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-LABEL: flat_atomic_and_i64_noret_scalar:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s7
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_cmp_eq_u32 s5, s34
+; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_mov_b64 s[34:35], -1
+; GCN1-NEXT: s_cbranch_vccnz .LBB44_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_vccz .LBB44_4
+; GCN1-NEXT: .LBB44_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB44_3: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execnz .LBB44_2
+; GCN1-NEXT: .LBB44_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
+; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT: s_cselect_b32 s34, s4, -1
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: s_add_i32 s34, s34, 4
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_and_b32_e32 v2, s6, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v3, s7, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_noret_scalar:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cmp_eq_u32 s5, s34
+; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_mov_b64 s[34:35], -1
+; GCN2-NEXT: s_cbranch_vccnz .LBB44_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_vccz .LBB44_4
+; GCN2-NEXT: .LBB44_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB44_3: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execnz .LBB44_2
+; GCN2-NEXT: .LBB44_4: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GCN2-NEXT: s_cselect_b32 s34, s4, -1
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: s_add_i32 s34, s34, 4
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_and_b32_e32 v2, s6, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v3, s7, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_noret_scalar:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s7
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
+; GCN3-NEXT: s_cmp_eq_u32 s5, s35
+; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_mov_b64 s[34:35], -1
+; GCN3-NEXT: s_cbranch_vccnz .LBB44_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_vccz .LBB44_4
+; GCN3-NEXT: .LBB44_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB44_3: ; %atomicrmw.global
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v3, s7
+; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_cbranch_execnz .LBB44_2
+; GCN3-NEXT: .LBB44_4: ; %atomicrmw.private
+; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GCN3-NEXT: s_cselect_b32 s34, s4, -1
+; GCN3-NEXT: v_mov_b32_e32 v0, s34
+; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_and_b32_e32 v1, s7, v1
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v2, s6, v2
+; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst
 ret void
@@ -1621,41 +5558,130 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-LABEL: flat_atomic_and_i64_noret_offset_scalar:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
 ; GCN1-NEXT: s_add_u32 s34, s4, 32
 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s7
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_cmp_eq_u32 s35, s36
+; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN1-NEXT: s_mov_b64 s[36:37], -1
+; GCN1-NEXT: s_cbranch_vccnz .LBB45_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_vccz .LBB45_4
+; GCN1-NEXT: .LBB45_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB45_3: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execnz .LBB45_2
+; GCN1-NEXT: .LBB45_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
+; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT: s_cselect_b32 s34, s34, -1
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: s_add_i32 s34, s34, 4
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_and_b32_e32 v2, s6, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v3, s7, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_noret_offset_scalar:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
 ; GCN2-NEXT: s_add_u32 s34, s4, 32
 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cmp_eq_u32 s35, s36
+; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT: s_mov_b64 s[36:37], -1
+; GCN2-NEXT: s_cbranch_vccnz .LBB45_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_vccz .LBB45_4
+; GCN2-NEXT: .LBB45_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB45_3: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execnz .LBB45_2
+; GCN2-NEXT: .LBB45_4: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
+; GCN2-NEXT: s_cselect_b32 s34, s34, -1
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: s_add_i32 s34, s34, 4
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_and_b32_e32 v2, s6, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v3, s7, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_noret_offset_scalar:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s7
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] offset:32
+; GCN3-NEXT: s_add_u32 s34, s4, 32
+; GCN3-NEXT: s_addc_u32 s35, s5, 0
+; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT: s_cmp_eq_u32 s35, s37
+; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT: s_mov_b64 s[36:37], -1
+; GCN3-NEXT: s_cbranch_vccnz .LBB45_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_vccz .LBB45_4
+; GCN3-NEXT: .LBB45_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB45_3: ; %atomicrmw.global
+; GCN3-NEXT: v_mov_b32_e32 v0, s34
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v3, s7
+; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_cbranch_execnz .LBB45_2
+; GCN3-NEXT: .LBB45_4: ; %atomicrmw.private
+; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
+; GCN3-NEXT: s_cselect_b32 s34, s34, -1
+; GCN3-NEXT: v_mov_b32_e32 v0, s34
+; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_and_b32_e32 v1, s7, v1
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v2, s6, v2
+; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i64, ptr %out, i64 4
 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst
@@ -1666,37 +5692,118 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-LABEL: flat_atomic_and_i64_ret_scalar:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s7
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_cmp_eq_u32 s5, s34
+; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_vccz .LBB46_2
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execz .LBB46_3
+; GCN1-NEXT: s_branch .LBB46_4
+; GCN1-NEXT: .LBB46_2:
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: .LBB46_3: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
+; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT: s_cselect_b32 s34, s4, -1
+; GCN1-NEXT: v_mov_b32_e32 v2, s34
+; GCN1-NEXT: s_add_i32 s34, s34, 4
+; GCN1-NEXT: v_mov_b32_e32 v3, s34
+; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_and_b32_e32 v4, s6, v0
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v5, s7, v1
+; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
+; GCN1-NEXT: .LBB46_4: ; %atomicrmw.end
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_ret_scalar:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cmp_eq_u32 s5, s34
+; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_vccz .LBB46_2
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execz .LBB46_3
+; GCN2-NEXT: s_branch .LBB46_4
+; GCN2-NEXT: .LBB46_2:
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: .LBB46_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GCN2-NEXT: s_cselect_b32 s34, s4, -1
+; GCN2-NEXT: v_mov_b32_e32 v2, s34
+; GCN2-NEXT: s_add_i32 s34, s34, 4
+; GCN2-NEXT: v_mov_b32_e32 v3, s34
+; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_and_b32_e32 v4, s6, v0
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v5, s7, v1
+; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
+; GCN2-NEXT: .LBB46_4: ; %atomicrmw.end
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_ret_scalar:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s7
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
+; GCN3-NEXT: s_cmp_eq_u32 s5, s35
+; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_vccz .LBB46_2
+; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v3, s7
+; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_cbranch_execz .LBB46_3
+; GCN3-NEXT: s_branch .LBB46_4
+; GCN3-NEXT: .LBB46_2:
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: .LBB46_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GCN3-NEXT: s_cselect_b32 s34, s4, -1
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_and_b32_e32 v3, s7, v1
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v4, s6, v0
+; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GCN3-NEXT: .LBB46_4: ; %atomicrmw.end
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %result = atomicrmw and ptr %ptr, i64 %in seq_cst
 ret i64 %result
@@ -1706,41 +5813,124 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-LABEL: flat_atomic_and_i64_ret_offset_scalar:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
 ; GCN1-NEXT: s_add_u32 s34, s4, 32
 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s7
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_cmp_eq_u32 s35, s36
+; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_vccz .LBB47_2
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execz .LBB47_3
+; GCN1-NEXT: s_branch .LBB47_4
+; GCN1-NEXT: .LBB47_2:
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: .LBB47_3: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
+; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT: s_cselect_b32 s34, s34, -1
+; GCN1-NEXT: v_mov_b32_e32 v2, s34
+; GCN1-NEXT: s_add_i32 s34, s34, 4
+; GCN1-NEXT: v_mov_b32_e32 v3, s34
+; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_and_b32_e32 v4, s6, v0
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v5, s7, v1
+; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
+; GCN1-NEXT: .LBB47_4: ; %atomicrmw.end
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_ret_offset_scalar:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
 ; GCN2-NEXT: s_add_u32 s34, s4, 32
 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cmp_eq_u32 s35, s36
+; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_vccz .LBB47_2
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execz .LBB47_3
+; GCN2-NEXT: s_branch .LBB47_4
+; GCN2-NEXT: .LBB47_2:
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: .LBB47_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
+; GCN2-NEXT: s_cselect_b32 s34, s34, -1
+; GCN2-NEXT: v_mov_b32_e32 v2, s34
+; GCN2-NEXT: s_add_i32 s34, s34, 4
+; GCN2-NEXT: v_mov_b32_e32 v3, s34
+; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_and_b32_e32 v4, s6, v0
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v5, s7, v1
+; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
+; GCN2-NEXT: .LBB47_4: ; %atomicrmw.end
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_ret_offset_scalar:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s7
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GCN3-NEXT: s_add_u32 s34, s4, 32
+; GCN3-NEXT: s_addc_u32 s35, s5, 0
+; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT: s_cmp_eq_u32 s35, s37
+; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_vccz .LBB47_2
+; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN3-NEXT: v_mov_b32_e32 v0, s34
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v3, s7
+; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_cbranch_execz .LBB47_3
+; GCN3-NEXT: s_branch .LBB47_4
+; GCN3-NEXT: .LBB47_2:
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: .LBB47_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
+; GCN3-NEXT: s_cselect_b32 s34, s34, -1
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_and_b32_e32 v3, s7, v1
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+;
GCN3-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB47_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst @@ -1751,29 +5941,124 @@ define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB48_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB48_4 +; GCN1-NEXT: .LBB48_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB48_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB48_2 +; GCN1-NEXT: .LBB48_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB48_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB48_4 +; GCN2-NEXT: .LBB48_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB48_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB48_2 +; GCN2-NEXT: .LBB48_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 
offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB48_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB48_4 +; GCN3-NEXT: .LBB48_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB48_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB48_2 +; GCN3-NEXT: .LBB48_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1784,29 +6069,127 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB49_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB49_4 +; GCN1-NEXT: .LBB49_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB49_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: 
$vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB49_2 +; GCN1-NEXT: .LBB49_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB49_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB49_4 +; GCN2-NEXT: .LBB49_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB49_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB49_2 +; GCN2-NEXT: .LBB49_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB49_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB49_4 +; GCN3-NEXT: .LBB49_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB49_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB49_2 +; GCN3-NEXT: .LBB49_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1821,12 +6204,26 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_nand_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB50_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB50_6 +; GCN1-NEXT: .LBB50_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB50_3: ; %atomicrmw.global ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB50_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1838,23 +6235,58 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: v_mov_b32_e32 v7, v5 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB50_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB50_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB50_2 +; GCN1-NEXT: .LBB50_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: 
v_and_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB50_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB50_6 +; GCN2-NEXT: .LBB50_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB50_3: ; %atomicrmw.global ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB50_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1866,20 +6298,53 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: v_mov_b32_e32 v7, v5 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB50_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB50_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB50_2 +; GCN2-NEXT: .LBB50_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB50_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: 
s_cbranch_execnz .LBB50_6 +; GCN3-NEXT: .LBB50_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB50_3: ; %atomicrmw.global ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB50_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1891,12 +6356,32 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB50_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB50_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB50_2 +; GCN3-NEXT: .LBB50_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v1, v1 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst ret void @@ -1906,17 +6391,211 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_nand_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v7, v[0:1] -; GCN1-NEXT: flat_load_dword v6, v[8:9] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB51_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB51_6 +; GCN1-NEXT: .LBB51_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB51_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB51_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v0, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v4, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v8, v6, v2 +; GCN1-NEXT: v_not_b32_e32 v5, v4 +; GCN1-NEXT: v_not_b32_e32 v4, v8 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB51_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB51_2 +; GCN1-NEXT: .LBB51_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i64_noret_offset: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB51_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB51_6 +; GCN2-NEXT: .LBB51_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB51_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB51_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v4, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v8, v6, v2 +; GCN2-NEXT: v_not_b32_e32 v5, v4 +; GCN2-NEXT: v_not_b32_e32 v4, v8 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB51_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GCN2-NEXT: s_cbranch_execz .LBB51_2 +; GCN2-NEXT: .LBB51_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i64_noret_offset: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB51_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB51_6 +; GCN3-NEXT: .LBB51_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB51_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB51_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v8, v6, v2 +; GCN3-NEXT: v_not_b32_e32 v5, v4 +; GCN3-NEXT: v_not_b32_e32 v4, v8 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB51_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB51_2 +; GCN3-NEXT: .LBB51_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v1, v1 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst + ret void +} + +define void @flat_atomic_nand_i64_noret_offset__noalias_private(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_nand_i64_noret_offset__noalias_private: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v0, v7, v3 ; GCN1-NEXT: v_and_b32_e32 v1, v6, v2 ; GCN1-NEXT: v_not_b32_e32 v5, v0 ; GCN1-NEXT: v_not_b32_e32 v4, v1 @@ -1928,12 +6607,12 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB51_1 +; GCN1-NEXT: s_cbranch_execnz .LBB52_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_nand_i64_noret_offset: +; GCN2-LABEL: flat_atomic_nand_i64_noret_offset__noalias_private: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 @@ -1943,7 +6622,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, v7, v3 @@ -1958,17 +6637,17 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB51_1 +; GCN2-NEXT: s_cbranch_execnz .LBB52_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_nand_i64_noret_offset: +; GCN3-LABEL: flat_atomic_nand_i64_noret_offset__noalias_private: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1983,12 +6662,12 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB51_1 +; GCN3-NEXT: s_cbranch_execnz .LBB52_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -1996,12 +6675,21 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_nand_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 
+; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB53_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] ; GCN1-NEXT: flat_load_dword v4, v[0:1] -; GCN1-NEXT: flat_load_dword v5, v[5:6] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB53_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -2014,24 +6702,56 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB52_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB53_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: .LBB53_4: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB53_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB53_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: v_mov_b32_e32 v1, v5 +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 -; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB53_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] ; GCN2-NEXT: flat_load_dword v4, v[0:1] -; GCN2-NEXT: flat_load_dword v5, v[5:6] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB53_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -2044,40 +6764,96 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { ; 
GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB52_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB53_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: .LBB53_4: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB53_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB53_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: v_mov_b32_e32 v1, v5 +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN3-NEXT: v_mov_b32_e32 v5, v1 +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB53_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB53_6 +; GCN3-NEXT: .LBB53_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB53_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB53_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 -; GCN3-NEXT: v_and_b32_e32 v8, v6, v2 -; GCN3-NEXT: v_not_b32_e32 v5, v4 -; GCN3-NEXT: v_not_b32_e32 v4, v8 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_and_b32_e32 v0, v9, v3 +; GCN3-NEXT: v_and_b32_e32 v1, v8, v2 +; GCN3-NEXT: v_not_b32_e32 v7, v0 +; GCN3-NEXT: v_not_b32_e32 v6, v1 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB52_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; 
GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB53_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB53_2 +; GCN3-NEXT: .LBB53_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v3, v3 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr %ptr, i64 %in seq_cst ret i64 %result @@ -2087,6 +6863,203 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-LABEL: flat_atomic_nand_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB54_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB54_6 +; GCN1-NEXT: .LBB54_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB54_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB54_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_and_b32_e32 v0, v9, v3 +; GCN1-NEXT: v_and_b32_e32 v1, v8, v2 +; GCN1-NEXT: v_not_b32_e32 v7, v0 +; GCN1-NEXT: v_not_b32_e32 v6, v1 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB54_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB54_2 +; GCN1-NEXT: .LBB54_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 
offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i64_ret_offset: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB54_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB54_6 +; GCN2-NEXT: .LBB54_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB54_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB54_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_and_b32_e32 v0, v9, v3 +; GCN2-NEXT: v_and_b32_e32 v1, v8, v2 +; GCN2-NEXT: v_not_b32_e32 v7, v0 +; GCN2-NEXT: v_not_b32_e32 v6, v1 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB54_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB54_2 +; GCN2-NEXT: .LBB54_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i64_ret_offset: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; 
implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB54_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB54_6 +; GCN3-NEXT: .LBB54_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB54_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB54_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_and_b32_e32 v0, v9, v3 +; GCN3-NEXT: v_and_b32_e32 v1, v8, v2 +; GCN3-NEXT: v_not_b32_e32 v7, v0 +; GCN3-NEXT: v_not_b32_e32 v6, v1 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB54_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB54_2 +; GCN3-NEXT: .LBB54_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v3, v3 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw nand ptr %gep, i64 %in seq_cst + ret i64 %result +} + +define i64 @flat_atomic_nand_i64_ret_offset__noalias_private(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_nand_i64_ret_offset__noalias_private: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 @@ -2094,7 +7067,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -2109,12 +7082,12 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB53_1 +; GCN1-NEXT: s_cbranch_execnz .LBB55_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_nand_i64_ret_offset: +; GCN2-LABEL: 
flat_atomic_nand_i64_ret_offset__noalias_private: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 @@ -2124,7 +7097,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -2139,17 +7112,17 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB53_1 +; GCN2-NEXT: s_cbranch_execnz .LBB55_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_nand_i64_ret_offset: +; GCN3-LABEL: flat_atomic_nand_i64_ret_offset__noalias_private: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -2164,14 +7137,14 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB53_1 +; GCN3-NEXT: s_cbranch_execnz .LBB55_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw nand ptr %gep, i64 %in seq_cst + %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -2179,18 +7152,30 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-LABEL: flat_atomic_nand_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB56_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB56_6 +; GCN1-NEXT: .LBB56_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB56_3: ; %atomicrmw.global ; GCN1-NEXT: s_add_u32 s34, s4, 4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: v_mov_b32_e32 v4, s35 -; GCN1-NEXT: flat_load_dword v2, v[0:1] -; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: v_mov_b32_e32 v4, s4 -; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 
s[34:35], 0 +; GCN1-NEXT: .LBB56_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2205,26 +7190,57 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB54_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB56_4 +; GCN1-NEXT: ; %bb.5: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB56_2 +; GCN1-NEXT: .LBB56_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB56_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB56_6 +; GCN2-NEXT: .LBB56_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB56_3: ; %atomicrmw.global ; GCN2-NEXT: s_add_u32 s34, s4, 4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: v_mov_b32_e32 v4, s35 -; GCN2-NEXT: flat_load_dword v2, v[0:1] -; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: v_mov_b32_e32 v4, s4 -; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB56_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2239,21 +7255,49 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB54_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB56_4 +; GCN2-NEXT: ; %bb.5: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB56_2 +; GCN2-NEXT: .LBB56_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: 
s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB56_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB56_6 +; GCN3-NEXT: .LBB56_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB56_3: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v4, s4 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB56_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2268,9 +7312,25 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB54_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_cbranch_execnz .LBB56_4 +; GCN3-NEXT: ; %bb.5: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB56_2 +; GCN3-NEXT: .LBB56_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v1, v1 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst ret void @@ -2280,6 +7340,201 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_nand_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 +; GCN1-NEXT: s_add_u32 s34, s4, 32 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB57_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; 
GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccnz .LBB57_6 +; GCN1-NEXT: .LBB57_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB57_3: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB57_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN1-NEXT: v_not_b32_e32 v1, v0 +; GCN1-NEXT: v_not_b32_e32 v0, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB57_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB57_2 +; GCN1-NEXT: .LBB57_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 +; GCN2-NEXT: s_add_u32 s34, s4, 32 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB57_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccnz .LBB57_6 +; GCN2-NEXT: .LBB57_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB57_3: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB57_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN2-NEXT: v_not_b32_e32 v1, v0 +; GCN2-NEXT: v_not_b32_e32 v0, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], 
v[4:5], v[0:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB57_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB57_2 +; GCN2-NEXT: .LBB57_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, s7, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB57_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccnz .LBB57_6 +; GCN3-NEXT: .LBB57_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB57_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB57_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v6, s6, v2 +; GCN3-NEXT: v_not_b32_e32 v1, v0 +; GCN3-NEXT: v_not_b32_e32 v0, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB57_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB57_2 +; GCN3-NEXT: .LBB57_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, s6, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v1, v1 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr 
%out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst + ret void +} + +define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar__noalias_addrspace(ptr inreg %out, i64 inreg %in) { +; GCN1-LABEL: flat_atomic_nand_i64_noret_offset_scalar__noalias_addrspace: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: s_add_u32 s36, s4, 36 @@ -2291,7 +7546,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: flat_load_dword v2, v[4:5] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2306,12 +7561,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB55_1 +; GCN1-NEXT: s_cbranch_execnz .LBB58_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GCN2-LABEL: flat_atomic_nand_i64_noret_offset_scalar__noalias_addrspace: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 @@ -2325,7 +7580,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: flat_load_dword v2, v[4:5] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2340,12 +7595,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB55_1 +; GCN2-NEXT: s_cbranch_execnz .LBB58_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GCN3-LABEL: flat_atomic_nand_i64_noret_offset_scalar__noalias_addrspace: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 @@ -2354,7 +7609,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2369,12 +7624,12 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB55_1 +; GCN3-NEXT: s_cbranch_execnz .LBB58_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = 
atomicrmw nand ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -2382,18 +7637,24 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-LABEL: flat_atomic_nand_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB59_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN1-NEXT: s_add_u32 s34, s4, 4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_load_dword v0, v[0:1] -; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB59_2: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v1 @@ -2408,26 +7669,55 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB56_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_cbranch_execnz .LBB59_2 +; GCN1-NEXT: ; %bb.3: ; %Flow ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB59_6 +; GCN1-NEXT: .LBB59_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB59_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, s7, v1 +; GCN1-NEXT: v_not_b32_e32 v4, v4 +; GCN1-NEXT: v_not_b32_e32 v5, v5 +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB59_6: ; %atomicrmw.phi +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB59_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN2-NEXT: s_add_u32 s34, s4, 4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; 
GCN2-NEXT: flat_load_dword v0, v[0:1] -; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB59_2: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -2442,21 +7732,47 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB56_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_cbranch_execnz .LBB59_2 +; GCN2-NEXT: ; %bb.3: ; %Flow ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB59_6 +; GCN2-NEXT: .LBB59_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB59_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, s7, v1 +; GCN2-NEXT: v_not_b32_e32 v4, v4 +; GCN2-NEXT: v_not_b32_e32 v5, v5 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB59_6: ; %atomicrmw.phi +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB59_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global ; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB59_2: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v1 @@ -2471,9 +7787,29 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB56_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_cbranch_execnz .LBB59_2 +; GCN3-NEXT: ; %bb.3: ; %Flow ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB59_6 +; GCN3-NEXT: .LBB59_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB59_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: 
v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN3-NEXT: v_not_b32_e32 v4, v4 +; GCN3-NEXT: v_not_b32_e32 v3, v3 +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB59_6: ; %atomicrmw.phi +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand ptr %ptr, i64 %in seq_cst ret i64 %result @@ -2483,6 +7819,195 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-LABEL: flat_atomic_nand_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 +; GCN1-NEXT: s_add_u32 s34, s4, 32 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB60_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB60_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN1-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN1-NEXT: v_not_b32_e32 v5, v0 +; GCN1-NEXT: v_not_b32_e32 v4, v1 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB60_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB60_6 +; GCN1-NEXT: .LBB60_4: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_cbranch_execz .LBB60_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, s7, v1 +; GCN1-NEXT: v_not_b32_e32 v4, v4 +; GCN1-NEXT: v_not_b32_e32 v5, v5 +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB60_6: ; %atomicrmw.phi +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword 
s36, s[34:35], 0x0 +; GCN2-NEXT: s_add_u32 s34, s4, 32 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB60_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB60_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN2-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN2-NEXT: v_not_b32_e32 v5, v0 +; GCN2-NEXT: v_not_b32_e32 v4, v1 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB60_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB60_6 +; GCN2-NEXT: .LBB60_4: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_cbranch_execz .LBB60_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, s7, v1 +; GCN2-NEXT: v_not_b32_e32 v4, v4 +; GCN2-NEXT: v_not_b32_e32 v5, v5 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB60_6: ; %atomicrmw.phi +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB60_4 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB60_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_and_b32_e32 v0, s7, v7 +; GCN3-NEXT: v_and_b32_e32 v1, s6, v6 +; GCN3-NEXT: v_not_b32_e32 v5, v0 +; GCN3-NEXT: v_not_b32_e32 v4, v1 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; 
GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB60_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB60_6 +; GCN3-NEXT: .LBB60_4: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_cbranch_execz .LBB60_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN3-NEXT: v_not_b32_e32 v4, v4 +; GCN3-NEXT: v_not_b32_e32 v3, v3 +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB60_6: ; %atomicrmw.phi +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw nand ptr %gep, i64 %in seq_cst + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar__noalias_private(ptr inreg %out, i64 inreg %in) { +; GCN1-LABEL: flat_atomic_nand_i64_ret_offset_scalar__noalias_private: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: s_add_u32 s36, s4, 36 @@ -2494,7 +8019,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[2:3] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB61_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v1 @@ -2509,12 +8034,12 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB57_1 +; GCN1-NEXT: s_cbranch_execnz .LBB61_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GCN2-LABEL: flat_atomic_nand_i64_ret_offset_scalar__noalias_private: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 @@ -2528,7 +8053,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[2:3] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB61_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v1 @@ -2543,12 +8068,12 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB57_1 +; GCN2-NEXT: s_cbranch_execnz .LBB61_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; 
GCN3-LABEL: flat_atomic_nand_i64_ret_offset_scalar__noalias_private: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 @@ -2557,7 +8082,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB61_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v1 @@ -2572,12 +8097,12 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB57_1 +; GCN3-NEXT: s_cbranch_execnz .LBB61_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw nand ptr %gep, i64 %in seq_cst + %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -2585,86 +8110,190 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v7, v[0:1] -; GCN1-NEXT: flat_load_dword v6, v[8:9] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB62_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB62_6 +; GCN1-NEXT: .LBB62_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB62_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB62_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v0, v7, v3 -; GCN1-NEXT: v_and_b32_e32 v1, v6, v2 -; GCN1-NEXT: v_not_b32_e32 v5, v0 -; GCN1-NEXT: v_not_b32_e32 v4, v1 -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN1-NEXT: v_and_b32_e32 v4, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v8, v6, v2 +; GCN1-NEXT: v_not_b32_e32 v5, v4 +; GCN1-NEXT: v_not_b32_e32 v4, v8 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN1-NEXT: v_mov_b32_e32 v7, v1 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v6, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB58_1 -; GCN1-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB62_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB62_2 +; GCN1-NEXT: .LBB62_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v7, v[0:1] -; GCN2-NEXT: flat_load_dword v6, v[8:9] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB62_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB62_6 +; GCN2-NEXT: .LBB62_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB62_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB62_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v0, v7, v3 -; GCN2-NEXT: v_and_b32_e32 v1, v6, v2 -; GCN2-NEXT: v_not_b32_e32 v5, v0 -; GCN2-NEXT: v_not_b32_e32 v4, v1 -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN2-NEXT: v_and_b32_e32 v4, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v8, v6, v2 +; GCN2-NEXT: v_not_b32_e32 v5, v4 +; GCN2-NEXT: v_not_b32_e32 v4, v8 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] -; GCN2-NEXT: v_mov_b32_e32 v7, v1 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v6, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB58_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], 
v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB62_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB62_2 +; GCN2-NEXT: .LBB62_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB62_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB62_6 +; GCN3-NEXT: .LBB62_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB62_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB62_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 ; GCN3-NEXT: v_and_b32_e32 v8, v6, v2 ; GCN3-NEXT: v_not_b32_e32 v5, v4 ; GCN3-NEXT: v_not_b32_e32 v4, v8 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB58_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB62_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB62_2 +; GCN3-NEXT: .LBB62_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 
offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v4, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v1, v1 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -2675,14 +8304,29 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB63_3 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB63_6 +; GCN1-NEXT: .LBB63_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB63_3: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB63_4: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -2695,24 +8339,60 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB59_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB63_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB63_2 +; GCN1-NEXT: .LBB63_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN1-NEXT: v_not_b32_e32 v2, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 
offen ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB63_3 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB63_6 +; GCN2-NEXT: .LBB63_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB63_3: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB63_4: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -2725,38 +8405,92 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB59_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB63_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB63_2 +; GCN2-NEXT: .LBB63_6: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN2-NEXT: v_not_b32_e32 v2, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB63_3 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB63_6 +; GCN3-NEXT: .LBB63_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB63_3: ; %atomicrmw.global +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB63_4: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v7, v5 -; GCN3-NEXT: v_mov_b32_e32 v6, v4 -; GCN3-NEXT: v_and_b32_e32 v4, v7, v3 -; GCN3-NEXT: v_and_b32_e32 v8, v6, v2 -; GCN3-NEXT: v_not_b32_e32 v5, v4 -; GCN3-NEXT: v_not_b32_e32 v4, v8 -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_and_b32_e32 v0, v9, v3 +; GCN3-NEXT: v_and_b32_e32 v1, v8, v2 +; GCN3-NEXT: v_not_b32_e32 v7, v0 +; GCN3-NEXT: v_not_b32_e32 v6, v1 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB59_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB63_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB63_2 +; GCN3-NEXT: .LBB63_6: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_and_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN3-NEXT: v_not_b32_e32 v2, v2 +; GCN3-NEXT: v_not_b32_e32 v3, v3 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -2771,25 +8505,118 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GCN1-LABEL: flat_atomic_or_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB64_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB64_4 +; GCN1-NEXT: .LBB64_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global
 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB64_2
+; GCN1-NEXT: .LBB64_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v2, v4, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, v5, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_noret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB64_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB64_4
+; GCN2-NEXT: .LBB64_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global
 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB64_2
+; GCN2-NEXT: .LBB64_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v2, v4, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, v5, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_noret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB64_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB64_4
+; GCN3-NEXT: .LBB64_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB64_3: ; %atomicrmw.global
 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execz .LBB64_2
+; GCN3-NEXT: .LBB64_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v2, v4, v2
+; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst
 ret void
@@ -2799,29 +8626,124 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_or_i64_noret_offset:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB65_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB65_4
+; GCN1-NEXT: .LBB65_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global
 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB65_2
+; GCN1-NEXT: .LBB65_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v2, v4, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, v5, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_noret_offset:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB65_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB65_4
+; GCN2-NEXT: .LBB65_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global
 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB65_2
+; GCN2-NEXT: .LBB65_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v2, v4, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, v5, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_noret_offset:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB65_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB65_4
+; GCN3-NEXT: .LBB65_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB65_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execz .LBB65_2
+; GCN3-NEXT: .LBB65_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v2, v4, v2
+; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i64, ptr %out, i64 4
 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst
@@ -2832,25 +8754,127 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_or_i64_ret:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN1-NEXT: v_mov_b32_e32 v5, v1
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB66_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB66_4
+; GCN1-NEXT: .LBB66_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB66_3: ; %atomicrmw.global
+; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB66_2
+; GCN1-NEXT: .LBB66_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
+; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v2, v0, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, v1, v3
+; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_ret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN2-NEXT: v_mov_b32_e32 v5, v1
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB66_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB66_4
+; GCN2-NEXT: .LBB66_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB66_3: ; %atomicrmw.global
+; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB66_2
+; GCN2-NEXT: .LBB66_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
+; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v2, v0, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, v1, v3
+; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_ret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v5, v1
+; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB66_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB66_4
+; GCN3-NEXT: .LBB66_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB66_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execz .LBB66_2
+; GCN3-NEXT: .LBB66_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_or_b32_e32 v3, v1, v3
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v2, v0, v2
+; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %result = atomicrmw or ptr %ptr, i64 %in seq_cst
 ret i64 %result
@@ -2860,29 +8884,127 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_or_i64_ret_offset:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB67_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB67_4
+; GCN1-NEXT: .LBB67_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB67_3: ; %atomicrmw.global
+; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB67_2
+; GCN1-NEXT: .LBB67_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
+; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v2, v0, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, v1, v3
+; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_ret_offset:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB67_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB67_4
+; GCN2-NEXT: .LBB67_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB67_3: ; %atomicrmw.global
+; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB67_2
+; GCN2-NEXT: .LBB67_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
+; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v2, v0, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, v1, v3
+; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_ret_offset:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
+; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB67_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB67_4
+; GCN3-NEXT: .LBB67_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB67_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execz .LBB67_2
+; GCN3-NEXT: .LBB67_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_or_b32_e32 v3, v1, v3
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v2, v0, v2
+; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i64, ptr %out, i64 4
 %result = atomicrmw or ptr %gep, i64 %in seq_cst
@@ -2893,37 +9015,124 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
 ; GCN1-LABEL: flat_atomic_or_i64_noret_scalar:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s7
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_cmp_eq_u32 s5, s34
+; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_mov_b64 s[34:35], -1
+; GCN1-NEXT: s_cbranch_vccnz .LBB68_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_vccz .LBB68_4
+; GCN1-NEXT: .LBB68_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB68_3: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execnz .LBB68_2
+; GCN1-NEXT: .LBB68_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
+; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT: s_cselect_b32 s34, s4, -1
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: s_add_i32 s34, s34, 4
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v2, s6, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, s7, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_noret_scalar:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cmp_eq_u32 s5, s34
+; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_mov_b64 s[34:35], -1
+; GCN2-NEXT: s_cbranch_vccnz .LBB68_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_vccz .LBB68_4
+; GCN2-NEXT: .LBB68_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB68_3: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execnz .LBB68_2
+; GCN2-NEXT: .LBB68_4: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GCN2-NEXT: s_cselect_b32 s34, s4, -1
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: s_add_i32 s34, s34, 4
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v2, s6, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, s7, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_noret_scalar:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s7
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
+; GCN3-NEXT: s_cmp_eq_u32 s5, s35
+; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_mov_b64 s[34:35], -1
+; GCN3-NEXT: s_cbranch_vccnz .LBB68_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_vccz .LBB68_4
+; GCN3-NEXT: .LBB68_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB68_3: ; %atomicrmw.global
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v3, s7
+; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_cbranch_execnz .LBB68_2
+; GCN3-NEXT: .LBB68_4: ; %atomicrmw.private
+; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GCN3-NEXT: s_cselect_b32 s34, s4, -1
+; GCN3-NEXT: v_mov_b32_e32 v0, s34
+; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_or_b32_e32 v1, s7, v1
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v2, s6, v2
+; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst
 ret void
@@ -2933,41 +9142,130 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
 ; GCN1-LABEL: flat_atomic_or_i64_noret_offset_scalar:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
 ; GCN1-NEXT: s_add_u32 s34, s4, 32
 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s7
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_cmp_eq_u32 s35, s36
+; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN1-NEXT: s_mov_b64 s[36:37], -1
+; GCN1-NEXT: s_cbranch_vccnz .LBB69_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_vccz .LBB69_4
+; GCN1-NEXT: .LBB69_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB69_3: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execnz .LBB69_2
+; GCN1-NEXT: .LBB69_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
+; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT: s_cselect_b32 s34, s34, -1
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: s_add_i32 s34, s34, 4
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v2, s6, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, s7, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_noret_offset_scalar:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
 ; GCN2-NEXT: s_add_u32 s34, s4, 32
 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cmp_eq_u32 s35, s36
+; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT: s_mov_b64 s[36:37], -1
+; GCN2-NEXT: s_cbranch_vccnz .LBB69_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_vccz .LBB69_4
+; GCN2-NEXT: .LBB69_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB69_3: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execnz .LBB69_2
+; GCN2-NEXT: .LBB69_4: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
+; GCN2-NEXT: s_cselect_b32 s34, s34, -1
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: s_add_i32 s34, s34, 4
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v2, s6, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, s7, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_noret_offset_scalar:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s7
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] offset:32
+; GCN3-NEXT: s_add_u32 s34, s4, 32
+; GCN3-NEXT: s_addc_u32 s35, s5, 0
+; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT: s_cmp_eq_u32 s35, s37
+; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT: s_mov_b64 s[36:37], -1
+; GCN3-NEXT: s_cbranch_vccnz .LBB69_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_vccz .LBB69_4
+; GCN3-NEXT: .LBB69_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB69_3: ; %atomicrmw.global
+; GCN3-NEXT: v_mov_b32_e32 v0, s34
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v3, s7
+; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_cbranch_execnz .LBB69_2
+; GCN3-NEXT: .LBB69_4: ; %atomicrmw.private
+; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
+; GCN3-NEXT: s_cselect_b32 s34, s34, -1
+; GCN3-NEXT: v_mov_b32_e32 v0, s34
+; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_or_b32_e32 v1, s7, v1
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v2, s6, v2
+; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i64, ptr %out, i64 4
 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst
@@ -2978,37 +9276,118 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
 ; GCN1-LABEL: flat_atomic_or_i64_ret_scalar:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s7
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_cmp_eq_u32 s5, s34
+; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_vccz .LBB70_2
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execz .LBB70_3
+; GCN1-NEXT: s_branch .LBB70_4
+; GCN1-NEXT: .LBB70_2:
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: .LBB70_3: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
+; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT: s_cselect_b32 s34, s4, -1
+; GCN1-NEXT: v_mov_b32_e32 v2, s34
+; GCN1-NEXT: s_add_i32 s34, s34, 4
+; GCN1-NEXT: v_mov_b32_e32 v3, s34
+; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v4, s6, v0
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v5, s7, v1
+; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
+; GCN1-NEXT: .LBB70_4: ; %atomicrmw.end
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_ret_scalar:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cmp_eq_u32 s5, s34
+; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_vccz .LBB70_2
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execz .LBB70_3
+; GCN2-NEXT: s_branch .LBB70_4
+; GCN2-NEXT: .LBB70_2:
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: .LBB70_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GCN2-NEXT: s_cselect_b32 s34, s4, -1
+; GCN2-NEXT: v_mov_b32_e32 v2, s34
+; GCN2-NEXT: s_add_i32 s34, s34, 4
+; GCN2-NEXT: v_mov_b32_e32 v3, s34
+; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v4, s6, v0
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v5, s7, v1
+; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
+; GCN2-NEXT: .LBB70_4: ; %atomicrmw.end
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_ret_scalar:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s7
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
+; GCN3-NEXT: s_cmp_eq_u32 s5, s35
+; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_vccz .LBB70_2
+; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
+; GCN3-NEXT: v_mov_b32_e32 v3, s7
+; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_cbranch_execz .LBB70_3
+; GCN3-NEXT: s_branch .LBB70_4
+; GCN3-NEXT: .LBB70_2:
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: .LBB70_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GCN3-NEXT: s_cselect_b32 s34, s4, -1
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_or_b32_e32 v3, s7, v1
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v4, s6, v0
+; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GCN3-NEXT: .LBB70_4: ; %atomicrmw.end
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %result = atomicrmw or ptr %ptr, i64 %in seq_cst
 ret i64 %result
@@ -3018,41 +9397,124 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-LABEL: flat_atomic_or_i64_ret_offset_scalar:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
 ; GCN1-NEXT: s_add_u32 s34, s4, 32
 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s7
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_cmp_eq_u32 s35, s36
+; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_vccz .LBB71_2
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execz .LBB71_3
+; GCN1-NEXT: s_branch .LBB71_4
+; GCN1-NEXT: .LBB71_2:
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: .LBB71_3: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
+; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT: s_cselect_b32 s34, s34, -1
+; GCN1-NEXT: v_mov_b32_e32 v2, s34
+; GCN1-NEXT: s_add_i32 s34, s34, 4
+; GCN1-NEXT: v_mov_b32_e32 v3, s34
+; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v4, s6, v0
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v5, s7, v1
+; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
+; GCN1-NEXT: .LBB71_4: ; %atomicrmw.end
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_ret_offset_scalar:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
+; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
 ; GCN2-NEXT: s_add_u32 s34, s4, 32
 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_cmp_eq_u32 s35, s36
+; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_vccz .LBB71_2
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execz .LBB71_3
+; GCN2-NEXT: s_branch .LBB71_4
+; GCN2-NEXT: .LBB71_2:
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: .LBB71_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
+; GCN2-NEXT: s_cselect_b32 s34, s34, -1
+; GCN2-NEXT: v_mov_b32_e32 v2, s34
+; GCN2-NEXT: s_add_i32 s34, s34, 4
+; GCN2-NEXT: v_mov_b32_e32 v3, s34
+; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v4, s6, v0
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v5, s7, v1
+; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
+; GCN2-NEXT: .LBB71_4: ; %atomicrmw.end
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_ret_offset_scalar:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s7
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GCN3-NEXT: s_add_u32 s34, s4, 32
+; GCN3-NEXT: s_addc_u32 s35, s5, 0
+; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
+; GCN3-NEXT: s_cmp_eq_u32 s35, s37
+; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_vccz .LBB71_2
+; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN3-NEXT: v_mov_b32_e32 v0, s34
+; GCN3-NEXT: v_mov_b32_e32 v2, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s35
+; GCN3-NEXT: v_mov_b32_e32 v3, s7
+; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_cbranch_execz .LBB71_3
+; GCN3-NEXT: s_branch .LBB71_4
+; GCN3-NEXT: .LBB71_2:
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: .LBB71_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
+; GCN3-NEXT: s_cselect_b32 s34, s34, -1
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_or_b32_e32 v3, s7, v1
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v4, s6, v0
+; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; GCN3-NEXT: .LBB71_4: ; %atomicrmw.end
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i64, ptr %out, i64 4
 %result = atomicrmw or ptr %gep, i64 %in seq_cst
 ret i64 %result
@@ -3063,29 +9525,124 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
 ; GCN1-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB72_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB72_4
+; GCN1-NEXT: .LBB72_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB72_3: ; %atomicrmw.global
 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB72_2
+; GCN1-NEXT: .LBB72_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v2, v4, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, v5, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB72_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB72_4
+; GCN2-NEXT: .LBB72_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB72_3: ; %atomicrmw.global
 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB72_2
+; GCN2-NEXT: .LBB72_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v2, v4, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, v5, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB72_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB72_4
+; GCN3-NEXT: .LBB72_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB72_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execz .LBB72_2
+; GCN3-NEXT: .LBB72_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v2, v4, v2
+; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i64, ptr %out, i64 4
 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -3095,30 +9652,128 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
 define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB73_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB73_4
+; GCN1-NEXT: .LBB73_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB73_3: ; %atomicrmw.global
+; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB73_2
+; GCN1-NEXT: .LBB73_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
+; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_or_b32_e32 v2, v0, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, v1, v3
+; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB73_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB73_4
+; GCN2-NEXT: .LBB73_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB73_3: ; %atomicrmw.global
+; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB73_2
+; GCN2-NEXT: .LBB73_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
+; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_or_b32_e32 v2, v0, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, v1, v3
+; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
+; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB73_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB73_4
+; GCN3-NEXT: .LBB73_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB73_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execz .LBB73_2
+; GCN3-NEXT: .LBB73_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_or_b32_e32 v3, v1, v3
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v2, v0, v2
+; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i64, ptr %out, i64 4
 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -3133,25 +9788,118 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_xor_i64_noret:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB74_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB74_4
+; GCN1-NEXT: .LBB74_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB74_3: ; %atomicrmw.global
 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB74_2
+; GCN1-NEXT: .LBB74_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_xor_b32_e32 v2, v4, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_xor_b32_e32 v3, v5, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i64_noret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB74_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB74_4
+; GCN2-NEXT: .LBB74_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB74_3: ; %atomicrmw.global
 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB74_2
+; GCN2-NEXT: .LBB74_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_xor_b32_e32 v2, v4, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_xor_b32_e32 v3, v5, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i64_noret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB74_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB74_4
+; GCN3-NEXT: .LBB74_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB74_3: ; %atomicrmw.global
 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execz .LBB74_2
+; GCN3-NEXT: .LBB74_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_xor_b32_e32 v1, v1, v3
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_xor_b32_e32 v2, v4, v2
+; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst
 ret void
@@ -3161,29 +9909,124 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_xor_i64_noret_offset:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB75_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB75_4
+; GCN1-NEXT: .LBB75_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB75_3: ; %atomicrmw.global
 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB75_2
+; GCN1-NEXT: .LBB75_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_xor_b32_e32 v2, v4, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_xor_b32_e32 v3, v5, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB75_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB75_4
+; GCN2-NEXT: .LBB75_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB75_3: ; %atomicrmw.global
 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB75_2
+; GCN2-NEXT: .LBB75_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_xor_b32_e32 v2, v4, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_xor_b32_e32 v3, v5, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB75_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB75_4
+; GCN3-NEXT: .LBB75_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB75_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execz .LBB75_2
+; GCN3-NEXT: .LBB75_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_xor_b32_e32 v1, v1, v3
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_xor_b32_e32 v2, v4, v2
+; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %gep = getelementptr i64, ptr %out, i64 4
 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst
@@ -3194,25 +10037,127 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_xor_i64_ret:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN1-NEXT: v_mov_b32_e32 v5, v1
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB76_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB76_4
+; GCN1-NEXT: .LBB76_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+; GCN1-NEXT: .LBB76_3: ; %atomicrmw.global
+; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB76_2
+; GCN1-NEXT: .LBB76_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
+; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_waitcnt vmcnt(1)
+; GCN1-NEXT: v_xor_b32_e32 v2, v0, v2
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_xor_b32_e32 v3, v1, v3
+; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_waitcnt vmcnt(0)
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i64_ret:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
+; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN2-NEXT: v_mov_b32_e32 v5, v1
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB76_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB76_4
+; GCN2-NEXT: .LBB76_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+; GCN2-NEXT: .LBB76_3: ; %atomicrmw.global
+; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB76_2
+; GCN2-NEXT: .LBB76_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
+; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_waitcnt vmcnt(1)
+; GCN2-NEXT: v_xor_b32_e32 v2, v0, v2
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_xor_b32_e32 v3, v1, v3
+; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_waitcnt vmcnt(0)
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i64_ret:
 ; GCN3: ; %bb.0:
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v5, v1
+; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB76_3
+; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB76_4
+; GCN3-NEXT: .LBB76_2: ; %atomicrmw.phi
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+; GCN3-NEXT: .LBB76_3: ; %atomicrmw.global
+; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN3-NEXT: s_cbranch_execz .LBB76_2
+; GCN3-NEXT: .LBB76_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
+; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
+; GCN3-NEXT: s_waitcnt vmcnt(1)
+; GCN3-NEXT: v_xor_b32_e32 v3, v1, v3
+; GCN3-NEXT: s_waitcnt vmcnt(0)
+; GCN3-NEXT: v_xor_b32_e32 v2, v0, v2
+; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_waitcnt vmcnt(0)
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
 %result = atomicrmw xor ptr %ptr, i64 %in seq_cst
 ret i64 %result
@@ -3222,29 +10167,127 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_xor_i64_ret_offset:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0)
lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB77_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB77_4 +; GCN1-NEXT: .LBB77_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB77_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB77_2 +; GCN1-NEXT: .LBB77_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v1, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB77_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB77_4 +; GCN2-NEXT: .LBB77_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB77_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB77_2 +; GCN2-NEXT: .LBB77_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v1, 
v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB77_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB77_4 +; GCN3-NEXT: .LBB77_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB77_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB77_2 +; GCN3-NEXT: .LBB77_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst @@ -3255,37 +10298,124 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-LABEL: flat_atomic_xor_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB78_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB78_4 +; GCN1-NEXT: .LBB78_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB78_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB78_2 +; GCN1-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; 
GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB78_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB78_4 +; GCN2-NEXT: .LBB78_2: ; %atomicrmw.phi +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB78_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB78_2 +; GCN2-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB78_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB78_4 +; GCN3-NEXT: .LBB78_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB78_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB78_2 +; GCN3-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 
0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst ret void @@ -3295,41 +10425,130 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-LABEL: flat_atomic_xor_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cbranch_vccnz .LBB79_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB79_4 +; GCN1-NEXT: .LBB79_2: ; %atomicrmw.phi +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB79_3: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execnz .LBB79_2 +; GCN1-NEXT: .LBB79_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cbranch_vccnz .LBB79_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB79_4 +; GCN2-NEXT: .LBB79_2: ; %atomicrmw.phi +; GCN2-NEXT: 
s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB79_3: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execnz .LBB79_2 +; GCN2-NEXT: .LBB79_4: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_cbranch_vccnz .LBB79_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB79_4 +; GCN3-NEXT: .LBB79_2: ; %atomicrmw.phi +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB79_3: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execnz .LBB79_2 +; GCN3-NEXT: .LBB79_4: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, s6, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst @@ -3340,37 +10559,118 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-LABEL: flat_atomic_xor_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s5, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 
0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB80_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB80_3 +; GCN1-NEXT: s_branch .LBB80_4 +; GCN1-NEXT: .LBB80_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB80_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s4, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v4, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, s7, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB80_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s5, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB80_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB80_3 +; GCN2-NEXT: s_branch .LBB80_4 +; GCN2-NEXT: .LBB80_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB80_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN2-NEXT: s_cselect_b32 s34, s4, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v4, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, s7, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB80_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s5, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 
vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB80_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB80_3 +; GCN3-NEXT: s_branch .LBB80_4 +; GCN3-NEXT: .LBB80_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB80_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GCN3-NEXT: s_cselect_b32 s34, s4, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v3, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v4, s6, v0 +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB80_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i64 %in seq_cst ret i64 %result @@ -3380,41 +10680,124 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-LABEL: flat_atomic_xor_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_cmp_eq_u32 s35, s36 +; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccz .LBB81_2 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_cbranch_execz .LBB81_3 +; GCN1-NEXT: s_branch .LBB81_4 +; GCN1-NEXT: .LBB81_2: +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: .LBB81_3: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec +; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_i32 s34, s34, 4 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v4, s6, v0 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, s7, v1 +; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN1-NEXT: .LBB81_4: ; %atomicrmw.end +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 +; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: 
v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_cmp_eq_u32 s35, s36 +; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccz .LBB81_2 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_cbranch_execz .LBB81_3 +; GCN2-NEXT: s_branch .LBB81_4 +; GCN2-NEXT: .LBB81_2: +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: .LBB81_3: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_i32 s34, s34, 4 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v4, s6, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, s7, v1 +; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen +; GCN2-NEXT: .LBB81_4: ; %atomicrmw.end +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: s_add_u32 s34, s4, 32 +; GCN3-NEXT: s_addc_u32 s35, s5, 0 +; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s35, s37 +; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccz .LBB81_2 +; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN3-NEXT: v_mov_b32_e32 v0, s34 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s35 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_cbranch_execz .LBB81_3 +; GCN3-NEXT: s_branch .LBB81_4 +; GCN3-NEXT: .LBB81_2: +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: .LBB81_3: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 +; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v3, s7, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v4, s6, v0 +; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: .LBB81_4: ; %atomicrmw.end +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst @@ -3425,29 +10808,124 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-LABEL: 
flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB82_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB82_4 +; GCN1-NEXT: .LBB82_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB82_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB82_2 +; GCN1-NEXT: .LBB82_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB82_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB82_4 +; GCN2-NEXT: .LBB82_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB82_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB82_2 +; GCN2-NEXT: .LBB82_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: 
flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB82_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB82_4 +; GCN3-NEXT: .LBB82_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB82_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB82_2 +; GCN3-NEXT: .LBB82_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v1, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, v4, v2 +; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3458,29 +10936,127 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB83_3 +; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB83_4 +; GCN1-NEXT: .LBB83_2: ; %atomicrmw.phi +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; GCN1-NEXT: .LBB83_3: ; %atomicrmw.global +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB83_2 +; GCN1-NEXT: .LBB83_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 +; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; 
GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v1, v3 +; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 +; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB83_3 +; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB83_4 +; GCN2-NEXT: .LBB83_2: ; %atomicrmw.phi +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; GCN2-NEXT: .LBB83_3: ; %atomicrmw.global +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB83_2 +; GCN2-NEXT: .LBB83_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 +; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v1, v3 +; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB83_3 +; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB83_4 +; GCN3-NEXT: .LBB83_2: ; %atomicrmw.phi +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] +; GCN3-NEXT: .LBB83_3: ; %atomicrmw.global +; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN3-NEXT: s_cbranch_execz .LBB83_2 +; GCN3-NEXT: .LBB83_4: ; %atomicrmw.private +; GCN3-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_xor_b32_e32 v3, v1, v3 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3500,7 +11076,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3514,7 +11090,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB80_1 +; GCN1-NEXT: s_cbranch_execnz .LBB84_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3527,7 +11103,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3541,7 +11117,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB80_1 +; GCN2-NEXT: s_cbranch_execnz .LBB84_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3551,7 +11127,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3565,11 +11141,11 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB80_1 +; GCN3-NEXT: s_cbranch_execnz .LBB84_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3584,7 +11160,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB81_1: ; 
%atomicrmw.start +; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3598,7 +11174,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB81_1 +; GCN1-NEXT: s_cbranch_execnz .LBB85_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3613,7 +11189,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3627,7 +11203,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB81_1 +; GCN2-NEXT: s_cbranch_execnz .LBB85_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3637,7 +11213,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3651,12 +11227,12 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB81_1 +; GCN3-NEXT: s_cbranch_execnz .LBB85_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3669,7 +11245,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: flat_load_dword v5, v[5:6] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -3683,7 +11259,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB82_1 +; GCN1-NEXT: s_cbranch_execnz .LBB86_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 @@ -3698,7 +11274,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: flat_load_dword v5, v[5:6] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN2-NEXT: 
.LBB86_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -3712,7 +11288,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB82_1 +; GCN2-NEXT: s_cbranch_execnz .LBB86_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 @@ -3724,7 +11300,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -3738,13 +11314,13 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB82_1 +; GCN3-NEXT: s_cbranch_execnz .LBB86_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw max ptr %ptr, i64 %in seq_cst + %result = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -3759,7 +11335,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -3773,7 +11349,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB83_1 +; GCN1-NEXT: s_cbranch_execnz .LBB87_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3788,7 +11364,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -3802,7 +11378,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB83_1 +; GCN2-NEXT: s_cbranch_execnz .LBB87_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3812,7 +11388,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB83_1: ; %atomicrmw.start +; 
GCN3-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -3826,14 +11402,14 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB83_1 +; GCN3-NEXT: s_cbranch_execnz .LBB87_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw max ptr %gep, i64 %in seq_cst + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -3854,7 +11430,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3868,7 +11444,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB84_1 +; GCN1-NEXT: s_cbranch_execnz .LBB88_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3889,7 +11465,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3903,7 +11479,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB84_1 +; GCN2-NEXT: s_cbranch_execnz .LBB88_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3919,7 +11495,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3933,11 +11509,11 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB84_1 +; GCN3-NEXT: s_cbranch_execnz .LBB88_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void 
} @@ -3958,7 +11534,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3972,7 +11548,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB85_1 +; GCN1-NEXT: s_cbranch_execnz .LBB89_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3993,7 +11569,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -4007,7 +11583,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB85_1 +; GCN2-NEXT: s_cbranch_execnz .LBB89_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4023,7 +11599,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -4037,12 +11613,12 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB85_1 +; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4063,7 +11639,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -4077,7 +11653,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB86_1 +; GCN1-NEXT: s_cbranch_execnz .LBB90_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: 
s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4098,7 +11674,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -4112,7 +11688,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB86_1 +; GCN2-NEXT: s_cbranch_execnz .LBB90_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4128,7 +11704,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -4142,11 +11718,11 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB86_1 +; GCN3-NEXT: s_cbranch_execnz .LBB90_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw max ptr %ptr, i64 %in seq_cst + %result = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -4167,7 +11743,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -4181,7 +11757,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB87_1 +; GCN1-NEXT: s_cbranch_execnz .LBB91_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4202,7 +11778,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -4216,7 +11792,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB87_1 +; GCN2-NEXT: s_cbranch_execnz .LBB91_1 ; GCN2-NEXT: ; %bb.2: 
; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4232,7 +11808,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -4246,12 +11822,12 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB87_1 +; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw max ptr %gep, i64 %in seq_cst + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -4272,7 +11848,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -4286,7 +11862,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB88_1 +; GCN1-NEXT: s_cbranch_execnz .LBB92_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -4306,7 +11882,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -4320,7 +11896,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB88_1 +; GCN2-NEXT: s_cbranch_execnz .LBB92_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -4338,7 +11914,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -4352,13 +11928,13 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB88_1 +; GCN3-NEXT: s_cbranch_execnz .LBB92_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: %ptr 
= getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4378,7 +11954,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -4392,7 +11968,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB89_1 +; GCN1-NEXT: s_cbranch_execnz .LBB93_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -4415,7 +11991,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -4429,7 +12005,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB89_1 +; GCN2-NEXT: s_cbranch_execnz .LBB93_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -4450,7 +12026,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s9 ; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -4464,7 +12040,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB89_1 +; GCN3-NEXT: s_cbranch_execnz .LBB93_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -4474,7 +12050,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void } @@ -4494,7 +12070,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, 
s[2:3], v[2:3] @@ -4508,7 +12084,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB90_1 +; GCN1-NEXT: s_cbranch_execnz .LBB94_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -4526,7 +12102,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -4540,7 +12116,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB90_1 +; GCN2-NEXT: s_cbranch_execnz .LBB94_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -4558,7 +12134,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -4572,12 +12148,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB90_1 +; GCN3-NEXT: s_cbranch_execnz .LBB94_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4595,7 +12171,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -4609,7 +12185,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB91_1 +; GCN1-NEXT: s_cbranch_execnz .LBB95_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -4630,7 +12206,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -4644,7 +12220,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: 
v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB91_1 +; GCN2-NEXT: s_cbranch_execnz .LBB95_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -4665,7 +12241,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s9 ; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -4679,7 +12255,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB91_1 +; GCN3-NEXT: s_cbranch_execnz .LBB95_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -4688,7 +12264,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void } @@ -4704,7 +12280,7 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -4718,7 +12294,7 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB92_1 +; GCN1-NEXT: s_cbranch_execnz .LBB96_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4733,7 +12309,7 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -4747,7 +12323,7 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB92_1 +; GCN2-NEXT: s_cbranch_execnz .LBB96_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4757,7 +12333,7 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB96_1: ; 
%atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -4771,12 +12347,12 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB92_1 +; GCN3-NEXT: s_cbranch_execnz .LBB96_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret void } @@ -4791,7 +12367,7 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -4805,7 +12381,7 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB93_1 +; GCN1-NEXT: s_cbranch_execnz .LBB97_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4820,7 +12396,7 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -4834,7 +12410,7 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB93_1 +; GCN2-NEXT: s_cbranch_execnz .LBB97_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4844,7 +12420,7 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -4858,14 +12434,14 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB93_1 +; GCN3-NEXT: s_cbranch_execnz .LBB97_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = 
atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret i64 %result } @@ -4882,7 +12458,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4896,7 +12472,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB94_1 +; GCN1-NEXT: s_cbranch_execnz .LBB98_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4909,7 +12485,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4923,7 +12499,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB94_1 +; GCN2-NEXT: s_cbranch_execnz .LBB98_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4933,7 +12509,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4947,11 +12523,11 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB94_1 +; GCN3-NEXT: s_cbranch_execnz .LBB98_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4966,7 +12542,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4980,7 +12556,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB95_1 +; GCN1-NEXT: s_cbranch_execnz .LBB99_1 ; GCN1-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4995,7 +12571,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -5009,7 +12585,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB95_1 +; GCN2-NEXT: s_cbranch_execnz .LBB99_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5019,7 +12595,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -5033,12 +12609,12 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB95_1 +; GCN3-NEXT: s_cbranch_execnz .LBB99_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5051,7 +12627,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: flat_load_dword v5, v[5:6] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -5065,7 +12641,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB96_1 +; GCN1-NEXT: s_cbranch_execnz .LBB100_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 @@ -5080,7 +12656,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: flat_load_dword v5, v[5:6] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -5094,7 +12670,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB96_1 +; GCN2-NEXT: s_cbranch_execnz .LBB100_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 @@ -5106,7 +12682,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -5120,13 +12696,13 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB96_1 +; GCN3-NEXT: s_cbranch_execnz .LBB100_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw umax ptr %ptr, i64 %in seq_cst + %result = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5141,7 +12717,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -5155,7 +12731,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB97_1 +; GCN1-NEXT: s_cbranch_execnz .LBB101_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5170,7 +12746,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -5184,7 +12760,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB97_1 +; GCN2-NEXT: s_cbranch_execnz .LBB101_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5194,7 +12770,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -5208,14 +12784,14 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB97_1 +; GCN3-NEXT: s_cbranch_execnz .LBB101_1 ; 
GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umax ptr %gep, i64 %in seq_cst + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5236,7 +12812,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5250,7 +12826,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB98_1 +; GCN1-NEXT: s_cbranch_execnz .LBB102_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5271,7 +12847,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5285,7 +12861,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB98_1 +; GCN2-NEXT: s_cbranch_execnz .LBB102_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5301,7 +12877,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5315,11 +12891,11 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB98_1 +; GCN3-NEXT: s_cbranch_execnz .LBB102_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5340,7 +12916,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5354,7 +12930,7 @@ 
define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB99_1 +; GCN1-NEXT: s_cbranch_execnz .LBB103_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5375,7 +12951,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5389,7 +12965,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB99_1 +; GCN2-NEXT: s_cbranch_execnz .LBB103_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5405,7 +12981,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5419,12 +12995,12 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB99_1 +; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5445,7 +13021,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -5459,7 +13035,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB100_1 +; GCN1-NEXT: s_cbranch_execnz .LBB104_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5480,7 +13056,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -5494,7 +13070,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB100_1 +; GCN2-NEXT: s_cbranch_execnz .LBB104_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5510,7 +13086,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -5524,11 +13100,11 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB100_1 +; GCN3-NEXT: s_cbranch_execnz .LBB104_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw umax ptr %ptr, i64 %in seq_cst + %result = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5549,7 +13125,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -5563,7 +13139,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB101_1 +; GCN1-NEXT: s_cbranch_execnz .LBB105_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5584,7 +13160,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -5598,7 +13174,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB101_1 +; GCN2-NEXT: s_cbranch_execnz .LBB105_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5614,7 +13190,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -5628,12 +13204,12 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB101_1 +; GCN3-NEXT: s_cbranch_execnz .LBB105_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umax ptr %gep, i64 %in seq_cst + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5654,7 +13230,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -5668,7 +13244,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB102_1 +; GCN1-NEXT: s_cbranch_execnz .LBB106_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -5688,7 +13264,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] @@ -5702,7 +13278,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB102_1 +; GCN2-NEXT: s_cbranch_execnz .LBB106_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -5720,7 +13296,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5734,13 +13310,13 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB102_1 +; GCN3-NEXT: s_cbranch_execnz .LBB106_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5760,7 +13336,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: 
v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -5774,7 +13350,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB103_1 +; GCN1-NEXT: s_cbranch_execnz .LBB107_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -5797,7 +13373,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -5811,7 +13387,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB103_1 +; GCN2-NEXT: s_cbranch_execnz .LBB107_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -5832,7 +13408,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s9 ; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -5846,7 +13422,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB103_1 +; GCN3-NEXT: s_cbranch_execnz .LBB107_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -5856,7 +13432,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void } @@ -5875,7 +13451,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -5889,7 +13465,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB104_1 +; GCN1-NEXT: s_cbranch_execnz .LBB108_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, 
exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -5910,7 +13486,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -5924,7 +13500,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB104_1 +; GCN2-NEXT: s_cbranch_execnz .LBB108_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -5945,7 +13521,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s9 ; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -5959,7 +13535,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB104_1 +; GCN3-NEXT: s_cbranch_execnz .LBB108_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -5968,7 +13544,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void } @@ -5984,7 +13560,7 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -5998,7 +13574,7 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB105_1 +; GCN1-NEXT: s_cbranch_execnz .LBB109_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6013,7 +13589,7 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -6027,7 +13603,7 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: s_or_b64 s[4:5], vcc, 
s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB105_1 +; GCN2-NEXT: s_cbranch_execnz .LBB109_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6037,7 +13613,7 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -6051,12 +13627,12 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB105_1 +; GCN3-NEXT: s_cbranch_execnz .LBB109_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret void } @@ -6071,7 +13647,7 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -6085,7 +13661,7 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB106_1 +; GCN1-NEXT: s_cbranch_execnz .LBB110_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6100,7 +13676,7 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -6114,7 +13690,7 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB106_1 +; GCN2-NEXT: s_cbranch_execnz .LBB110_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6124,7 +13700,7 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -6138,14 +13714,14 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB106_1 +; GCN3-NEXT: s_cbranch_execnz .LBB110_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret i64 %result } @@ -6162,7 +13738,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6176,7 +13752,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB107_1 +; GCN1-NEXT: s_cbranch_execnz .LBB111_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6189,7 +13765,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6203,7 +13779,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB107_1 +; GCN2-NEXT: s_cbranch_execnz .LBB111_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6213,7 +13789,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6227,11 +13803,11 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB107_1 +; GCN3-NEXT: s_cbranch_execnz .LBB111_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6246,7 +13822,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: 
flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6260,7 +13836,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB108_1 +; GCN1-NEXT: s_cbranch_execnz .LBB112_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6275,7 +13851,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6289,7 +13865,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB108_1 +; GCN2-NEXT: s_cbranch_execnz .LBB112_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6299,7 +13875,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6313,12 +13889,12 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB108_1 +; GCN3-NEXT: s_cbranch_execnz .LBB112_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6331,7 +13907,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: flat_load_dword v5, v[5:6] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -6345,7 +13921,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB109_1 +; GCN1-NEXT: s_cbranch_execnz .LBB113_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 @@ -6360,7 +13936,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: 
flat_load_dword v4, v[0:1] ; GCN2-NEXT: flat_load_dword v5, v[5:6] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -6374,7 +13950,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB109_1 +; GCN2-NEXT: s_cbranch_execnz .LBB113_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 @@ -6386,7 +13962,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -6400,13 +13976,13 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB109_1 +; GCN3-NEXT: s_cbranch_execnz .LBB113_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw umin ptr %ptr, i64 %in seq_cst + %result = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -6421,7 +13997,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -6435,7 +14011,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB110_1 +; GCN1-NEXT: s_cbranch_execnz .LBB114_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6450,7 +14026,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -6464,7 +14040,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB110_1 +; GCN2-NEXT: s_cbranch_execnz .LBB114_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6474,7 +14050,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -6488,14 +14064,14 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB110_1 +; GCN3-NEXT: s_cbranch_execnz .LBB114_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umin ptr %gep, i64 %in seq_cst + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -6516,7 +14092,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6530,7 +14106,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB111_1 +; GCN1-NEXT: s_cbranch_execnz .LBB115_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6551,7 +14127,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6565,7 +14141,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB111_1 +; GCN2-NEXT: s_cbranch_execnz .LBB115_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6581,7 +14157,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6595,11 +14171,11 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB111_1 +; GCN3-NEXT: s_cbranch_execnz .LBB115_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: 
s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6620,7 +14196,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6634,7 +14210,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB112_1 +; GCN1-NEXT: s_cbranch_execnz .LBB116_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6655,7 +14231,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6669,7 +14245,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB112_1 +; GCN2-NEXT: s_cbranch_execnz .LBB116_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6685,7 +14261,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6699,12 +14275,12 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB112_1 +; GCN3-NEXT: s_cbranch_execnz .LBB116_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6725,7 +14301,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -6739,7 +14315,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; 
GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB113_1 +; GCN1-NEXT: s_cbranch_execnz .LBB117_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6760,7 +14336,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -6774,7 +14350,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB113_1 +; GCN2-NEXT: s_cbranch_execnz .LBB117_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6790,7 +14366,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -6804,11 +14380,11 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB113_1 +; GCN3-NEXT: s_cbranch_execnz .LBB117_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw umin ptr %ptr, i64 %in seq_cst + %result = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -6829,7 +14405,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -6843,7 +14419,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB114_1 +; GCN1-NEXT: s_cbranch_execnz .LBB118_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6864,7 +14440,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -6878,7 +14454,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB114_1 +; GCN2-NEXT: s_cbranch_execnz .LBB118_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6894,7 +14470,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -6908,12 +14484,12 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB114_1 +; GCN3-NEXT: s_cbranch_execnz .LBB118_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umin ptr %gep, i64 %in seq_cst + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -6928,7 +14504,7 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6942,7 +14518,7 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB115_1 +; GCN1-NEXT: s_cbranch_execnz .LBB119_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -6957,7 +14533,7 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6971,7 +14547,7 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB115_1 +; GCN2-NEXT: s_cbranch_execnz .LBB119_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -6981,7 +14557,7 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -6995,12 +14571,12 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB115_1 +; GCN3-NEXT: s_cbranch_execnz .LBB119_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret void } @@ -7015,7 +14591,7 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -7029,7 +14605,7 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB116_1 +; GCN1-NEXT: s_cbranch_execnz .LBB120_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7044,7 +14620,7 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -7058,7 +14634,7 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB116_1 +; GCN2-NEXT: s_cbranch_execnz .LBB120_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7068,7 +14644,7 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -7082,14 +14658,14 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB116_1 +; GCN3-NEXT: s_cbranch_execnz .LBB120_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %result = atomicrmw 
umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret i64 %result } @@ -7106,7 +14682,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v6, v[0:1] ; GCN1-NEXT: flat_load_dword v7, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -7120,7 +14696,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v4 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB117_1 +; GCN1-NEXT: s_cbranch_execnz .LBB121_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7133,7 +14709,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v6, v[0:1] ; GCN2-NEXT: flat_load_dword v7, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -7147,7 +14723,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v4 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB117_1 +; GCN2-NEXT: s_cbranch_execnz .LBB121_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7157,7 +14733,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -7171,11 +14747,11 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB117_1 +; GCN3-NEXT: s_cbranch_execnz .LBB121_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7190,7 +14766,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -7204,7 +14780,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB118_1 +; GCN1-NEXT: s_cbranch_execnz .LBB122_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: 
s_setpc_b64 s[30:31] @@ -7219,7 +14795,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -7233,7 +14809,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB118_1 +; GCN2-NEXT: s_cbranch_execnz .LBB122_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7243,7 +14819,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -7257,12 +14833,12 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB118_1 +; GCN3-NEXT: s_cbranch_execnz .LBB122_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7275,7 +14851,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: flat_load_dword v5, v[5:6] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -7289,7 +14865,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB119_1 +; GCN1-NEXT: s_cbranch_execnz .LBB123_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 @@ -7304,7 +14880,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: flat_load_dword v5, v[5:6] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -7318,7 +14894,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB119_1 +; GCN2-NEXT: s_cbranch_execnz .LBB123_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 @@ -7330,7 
+14906,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -7344,13 +14920,13 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB119_1 +; GCN3-NEXT: s_cbranch_execnz .LBB123_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw min ptr %ptr, i64 %in seq_cst + %result = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7365,7 +14941,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -7379,7 +14955,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB120_1 +; GCN1-NEXT: s_cbranch_execnz .LBB124_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7394,7 +14970,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -7408,7 +14984,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB120_1 +; GCN2-NEXT: s_cbranch_execnz .LBB124_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7418,7 +14994,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -7432,14 +15008,14 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB120_1 +; GCN3-NEXT: s_cbranch_execnz .LBB124_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: 
v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw min ptr %gep, i64 %in seq_cst + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7460,7 +15036,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7474,7 +15050,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB121_1 +; GCN1-NEXT: s_cbranch_execnz .LBB125_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7495,7 +15071,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7509,7 +15085,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB121_1 +; GCN2-NEXT: s_cbranch_execnz .LBB125_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7525,7 +15101,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7539,11 +15115,11 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB121_1 +; GCN3-NEXT: s_cbranch_execnz .LBB125_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7564,7 +15140,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7578,7 +15154,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; 
GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB122_1 +; GCN1-NEXT: s_cbranch_execnz .LBB126_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7599,7 +15175,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7613,7 +15189,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB122_1 +; GCN2-NEXT: s_cbranch_execnz .LBB126_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7629,7 +15205,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7643,12 +15219,12 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB122_1 +; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7669,7 +15245,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -7683,7 +15259,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB123_1 +; GCN1-NEXT: s_cbranch_execnz .LBB127_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7704,7 +15280,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -7718,7 +15294,7 @@ define amdgpu_gfx i64 
@flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB123_1 +; GCN2-NEXT: s_cbranch_execnz .LBB127_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7734,7 +15310,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -7748,11 +15324,11 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB123_1 +; GCN3-NEXT: s_cbranch_execnz .LBB127_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw min ptr %ptr, i64 %in seq_cst + %result = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7773,7 +15349,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -7787,7 +15363,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB124_1 +; GCN1-NEXT: s_cbranch_execnz .LBB128_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -7808,7 +15384,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -7822,7 +15398,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB124_1 +; GCN2-NEXT: s_cbranch_execnz .LBB128_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -7838,7 +15414,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, 
v1 @@ -7852,12 +15428,12 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB124_1 +; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw min ptr %gep, i64 %in seq_cst + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7878,7 +15454,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB129_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -7892,7 +15468,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB125_1 +; GCN1-NEXT: s_cbranch_execnz .LBB129_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -7912,7 +15488,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB129_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -7926,7 +15502,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB125_1 +; GCN2-NEXT: s_cbranch_execnz .LBB129_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -7944,7 +15520,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB129_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7958,13 +15534,13 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB125_1 +; GCN3-NEXT: s_cbranch_execnz .LBB129_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7984,7 +15560,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB130_1: ; 
%atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -7998,7 +15574,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB126_1 +; GCN1-NEXT: s_cbranch_execnz .LBB130_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -8021,7 +15597,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB130_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -8035,7 +15611,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB126_1 +; GCN2-NEXT: s_cbranch_execnz .LBB130_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -8056,7 +15632,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s9 ; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB130_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -8070,7 +15646,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB126_1 +; GCN3-NEXT: s_cbranch_execnz .LBB130_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -8080,7 +15656,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 - %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void } @@ -8098,7 +15674,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 -; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB131_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -8112,7 +15688,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB127_1 +; GCN1-NEXT: s_cbranch_execnz .LBB131_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -8128,7 +15704,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: 
v_mov_b32_e32 v7, s2 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 -; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB131_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -8142,7 +15718,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB127_1 +; GCN2-NEXT: s_cbranch_execnz .LBB131_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -8158,7 +15734,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB131_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -8172,11 +15748,11 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB127_1 +; GCN3-NEXT: s_cbranch_execnz .LBB131_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: - %tmp0 = atomicrmw min ptr %out, i64 %in seq_cst + %tmp0 = atomicrmw min ptr %out, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8194,7 +15770,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB132_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -8208,7 +15784,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB128_1 +; GCN1-NEXT: s_cbranch_execnz .LBB132_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -8229,7 +15805,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB132_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -8243,7 +15819,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB128_1 +; GCN2-NEXT: s_cbranch_execnz .LBB132_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -8264,7 +15840,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s9 ; GCN3-NEXT: v_mov_b32_e32 v5, s8 -; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB132_1: ; %atomicrmw.start ; 
GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -8278,7 +15854,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB128_1 +; GCN3-NEXT: s_cbranch_execnz .LBB132_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -8287,7 +15863,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index - %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void } @@ -8303,7 +15879,7 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB133_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -8317,7 +15893,7 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB129_1 +; GCN1-NEXT: s_cbranch_execnz .LBB133_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -8332,7 +15908,7 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB133_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -8346,7 +15922,7 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB129_1 +; GCN2-NEXT: s_cbranch_execnz .LBB133_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -8356,7 +15932,7 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB133_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -8370,12 +15946,12 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB129_1 +; GCN3-NEXT: s_cbranch_execnz .LBB133_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr 
i64, ptr %out, i64 4 - %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret void } @@ -8390,7 +15966,7 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB134_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -8404,7 +15980,7 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB130_1 +; GCN1-NEXT: s_cbranch_execnz .LBB134_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -8419,7 +15995,7 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB134_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -8433,7 +16009,7 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB130_1 +; GCN2-NEXT: s_cbranch_execnz .LBB134_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -8443,7 +16019,7 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB134_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -8457,14 +16033,14 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB130_1 +; GCN3-NEXT: s_cbranch_execnz .LBB134_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret i64 %result } @@ -8496,7 +16072,7 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8529,7 +16105,7 @@ define void 
@flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8557,7 +16133,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst + %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8590,7 +16166,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8630,7 +16206,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8675,7 +16251,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8715,7 +16291,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst + %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8760,7 +16336,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8793,7 +16369,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret void } @@ -8826,7 +16402,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret i64 %result } @@ -8858,7 +16434,7 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw udec_wrap ptr 
%ptr, i64 %in seq_cst + %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8891,7 +16467,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8919,7 +16495,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst + %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8952,7 +16528,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8992,7 +16568,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst + %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -9037,7 +16613,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -9077,7 +16653,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst + %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -9122,7 +16698,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -9155,7 +16731,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret void } @@ -9188,8 +16764,9 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 - %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1, !amdgpu.no.remote.memory !0 ret i64 %result } !0 = !{} +!1 = !{i32 5, i32 6} diff --git 
a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index e45b5cb30ab89..973ca51667928 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1278,7 +1278,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1307,7 +1307,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1338,7 +1338,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1365,7 +1365,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } @@ -1390,7 +1390,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } @@ -1419,7 +1419,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %ret } @@ -1459,7 +1459,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX940-NEXT: buffer_inv sc1 ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst + %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst, !noalias.addrspace !1 ret void } @@ -1666,3 +1666,4 @@ attributes #3 = { "denormal-fp-math"="ieee,ieee" } attributes #4 = { "denormal-fp-math"="preserve-sign,preserve-sign" } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index 163f436ebc9bd..d55f7ca9f2baa 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -599,7 
+599,7 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #3 { ; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX90A: atomicrmw.shared: ; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) -; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]] ; GFX90A: atomicrmw.check.private: ; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) @@ -612,7 +612,7 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #3 { ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.global: ; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) -; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VALUE]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VALUE]] syncscope("wavefront") monotonic, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.phi: ; GFX90A-NEXT: [[RES:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] @@ -621,19 +621,19 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #3 { ; GFX90A-NEXT: ret float [[RES]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_flat_unsafe( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX940-NEXT: ret float [[RES]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_flat_unsafe( -; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX11-NEXT: ret float [[RES]] ; - %res = atomicrmw fadd ptr %ptr, float %value syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 + %res = atomicrmw fadd ptr %ptr, float %value syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret float %res } -define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 { -; CI-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +define double @test_atomicrmw_fadd_f64_flat_unsafe__noprivate(ptr %ptr, double %value) #3 { +; CI-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe__noprivate( ; CI-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: @@ -649,7 +649,7 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 { ; CI: 
atomicrmw.end: ; CI-NEXT: ret double [[TMP5]] ; -; GFX9-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX9-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe__noprivate( ; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: @@ -665,7 +665,7 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 { ; GFX9: atomicrmw.end: ; GFX9-NEXT: ret double [[TMP5]] ; -; GFX908-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX908-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe__noprivate( ; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: @@ -681,15 +681,15 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 { ; GFX908: atomicrmw.end: ; GFX908-NEXT: ret double [[TMP5]] ; -; GFX90A-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], double [[VALUE:%.*]] syncscope("wavefront") monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe__noprivate( +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], double [[VALUE:%.*]] syncscope("wavefront") monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX90A-NEXT: ret double [[RES]] ; -; GFX940-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], double [[VALUE:%.*]] syncscope("wavefront") monotonic, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe__noprivate( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], double [[VALUE:%.*]] syncscope("wavefront") monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] ; GFX940-NEXT: ret double [[RES]] ; -; GFX11-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX11-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe__noprivate( ; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: @@ -704,6 +704,167 @@ define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 { ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX11: atomicrmw.end: ; GFX11-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, double %value syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_atomicrmw_fadd_f64_flat_unsafe(ptr %ptr, double %value) #3 { +; CI-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; CI-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; CI-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; CI: atomicrmw.private: +; CI-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; CI-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; CI-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; CI-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; CI: atomicrmw.global: +; CI-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 
+; CI-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; CI-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; CI-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end1: +; CI-NEXT: br label [[ATOMICRMW_PHI]] +; CI: atomicrmw.phi: +; CI-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; CI-NEXT: br label [[ATOMICRMW_END:%.*]] +; CI: atomicrmw.end: +; CI-NEXT: ret double [[RES]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX9-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX9-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX9: atomicrmw.private: +; GFX9-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX9-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX9-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX9: atomicrmw.global: +; GFX9-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX9-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end1: +; GFX9-NEXT: br label [[ATOMICRMW_PHI]] +; GFX9: atomicrmw.phi: +; GFX9-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; GFX9-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret double [[RES]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX908-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX908-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX908: atomicrmw.private: +; GFX908-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX908-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX908-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX908: atomicrmw.global: +; GFX908-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ 
[[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end1: +; GFX908-NEXT: br label [[ATOMICRMW_PHI]] +; GFX908: atomicrmw.phi: +; GFX908-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; GFX908-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX90A: atomicrmw.private: +; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX90A-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX90A: atomicrmw.global: +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("wavefront") monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] +; GFX90A: atomicrmw.phi: +; GFX90A-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX940-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX940-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GFX940: atomicrmw.private: +; GFX940-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX940-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX940-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX940: atomicrmw.global: +; GFX940-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("wavefront") monotonic, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: br label [[ATOMICRMW_PHI]] +; GFX940: atomicrmw.phi: +; GFX940-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ] +; GFX940-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret double [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_flat_unsafe( +; GFX11-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GFX11-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label 
[[ATOMICRMW_GLOBAL:%.*]] +; GFX11: atomicrmw.private: +; GFX11-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX11-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GFX11-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GFX11: atomicrmw.global: +; GFX11-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end1: +; GFX11-NEXT: br label [[ATOMICRMW_PHI]] +; GFX11: atomicrmw.phi: +; GFX11-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; GFX11-NEXT: br label [[ATOMICRMW_END:%.*]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret double [[RES]] ; %res = atomicrmw fadd ptr %ptr, double %value syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 ret double %res @@ -1067,6 +1228,41 @@ define half @test_atomicrmw_fadd_f16_local(ptr addrspace(3) %ptr, half %value) { define double @test_atomicrmw_fadd_f64_flat(ptr %ptr, double %value) { ; ALL-LABEL: @test_atomicrmw_fadd_f64_flat( +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; ALL: atomicrmw.private: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; ALL-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; ALL: atomicrmw.global: +; ALL-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; ALL-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; ALL-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end1: +; ALL-NEXT: br label [[ATOMICRMW_PHI]] +; ALL: atomicrmw.phi: +; ALL-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ 
[[TMP6]], [[ATOMICRMW_END1]] ] +; ALL-NEXT: br label [[ATOMICRMW_END:%.*]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret double [[RES]] +; + %res = atomicrmw fadd ptr %ptr, double %value seq_cst + ret double %res +} + +define double @test_atomicrmw_fadd_f64_flat__noprivate(ptr %ptr, double %value) { +; ALL-LABEL: @test_atomicrmw_fadd_f64_flat__noprivate( ; ALL-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] ; ALL: atomicrmw.start: @@ -1082,7 +1278,7 @@ define double @test_atomicrmw_fadd_f64_flat(ptr %ptr, double %value) { ; ALL: atomicrmw.end: ; ALL-NEXT: ret double [[TMP5]] ; - %res = atomicrmw fadd ptr %ptr, double %value seq_cst + %res = atomicrmw fadd ptr %ptr, double %value seq_cst, !noalias.addrspace !1 ret double %res } @@ -2619,18 +2815,31 @@ define float @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mo define void @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode(ptr %ptr, double %value) #5 { ; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode( -; ALL-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; ALL: atomicrmw.private: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; ALL-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; ALL: atomicrmw.global: +; ALL-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 ; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] ; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; ALL-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; ALL-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end1: +; ALL-NEXT: br label [[ATOMICRMW_PHI]] +; ALL: atomicrmw.phi: +; ALL-NEXT: br label [[ATOMICRMW_END:%.*]] ; ALL: atomicrmw.end: ; ALL-NEXT: ret void ; @@ -2640,20 +2849,34 @@ define void @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_ define double 
@test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode(ptr %ptr, double %value) #5 { ; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode( -; ALL-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; ALL: atomicrmw.private: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; ALL-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; ALL: atomicrmw.global: +; ALL-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 ; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] ; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; ALL-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] monotonic monotonic, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; ALL-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end1: +; ALL-NEXT: br label [[ATOMICRMW_PHI]] +; ALL: atomicrmw.phi: +; ALL-NEXT: [[RET:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; ALL-NEXT: br label [[ATOMICRMW_END:%.*]] ; ALL: atomicrmw.end: -; ALL-NEXT: ret double [[TMP5]] +; ALL-NEXT: ret double [[RET]] ; %ret = atomicrmw fadd ptr %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0 ret double %ret @@ -4495,3 +4718,4 @@ attributes #4 = { "denormal-fp-math-f32"="dynamic,dynamic" } attributes #5 = { "denormal-fp-math"="dynamic,dynamic" } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll index 2a5e1bde029ee..3cb0165075e82 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll @@ -188,6 +188,41 @@ define half @test_atomicrmw_fmax_f16_local(ptr addrspace(3) %ptr, half %value) { define double @test_atomicrmw_fmax_f64_flat(ptr %ptr, double %value) { ; GCN-LABEL: @test_atomicrmw_fmax_f64_flat( +; GCN-NEXT: [[IS_PRIVATE:%.*]] = 
call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GCN-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GCN: atomicrmw.private: +; GCN-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GCN-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED_PRIVATE]], double [[VALUE:%.*]]) +; GCN-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GCN: atomicrmw.global: +; GCN-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP4:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GCN-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64 +; GCN-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GCN-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] seq_cst seq_cst, align 8 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GCN-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end1: +; GCN-NEXT: br label [[ATOMICRMW_PHI]] +; GCN: atomicrmw.phi: +; GCN-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], [[ATOMICRMW_END1]] ] +; GCN-NEXT: br label [[ATOMICRMW_END:%.*]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret double [[RES]] +; + %res = atomicrmw fmax ptr %ptr, double %value seq_cst + ret double %res +} + +define double @test_atomicrmw_fmax_f64_flat__noprivate(ptr %ptr, double %value) { +; GCN-LABEL: @test_atomicrmw_fmax_f64_flat__noprivate( ; GCN-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] ; GCN: atomicrmw.start: @@ -203,7 +238,7 @@ define double @test_atomicrmw_fmax_f64_flat(ptr %ptr, double %value) { ; GCN: atomicrmw.end: ; GCN-NEXT: ret double [[TMP6]] ; - %res = atomicrmw fmax ptr %ptr, double %value seq_cst + %res = atomicrmw fmax ptr %ptr, double %value seq_cst, !noalias.addrspace !0 ret double %res } @@ -257,6 +292,9 @@ define double @test_atomicrmw_fmax_f64_global_strictfp(ptr addrspace(1) %ptr, do %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst ret double %res } + +!0 = !{i32 5, i32 6} + ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GFX7: {{.*}} ; GFX9: {{.*}} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll index 0fa409b11b42f..3ab28af0872c4 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll @@ -188,6 +188,41 @@ define half @test_atomicrmw_fmin_f16_local(ptr addrspace(3) %ptr, half %value) { define double @test_atomicrmw_fmin_f64_flat(ptr %ptr, double %value) { ; GCN-LABEL: @test_atomicrmw_fmin_f64_flat( +; GCN-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GCN-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GCN: atomicrmw.private: +; GCN-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GCN-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GCN-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED_PRIVATE]], double [[VALUE:%.*]]) +; GCN-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GCN: atomicrmw.global: +; GCN-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[TMP4:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GCN-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64 +; GCN-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GCN-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] seq_cst seq_cst, align 8 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GCN-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end1: +; GCN-NEXT: br label [[ATOMICRMW_PHI]] +; GCN: atomicrmw.phi: +; GCN-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], [[ATOMICRMW_END1]] ] +; GCN-NEXT: br label [[ATOMICRMW_END:%.*]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret double [[RES]] +; + %res = atomicrmw fmin ptr %ptr, double %value seq_cst + ret double %res +} + +define double @test_atomicrmw_fmin_f64_flat__noprivate(ptr %ptr, double %value) { +; GCN-LABEL: @test_atomicrmw_fmin_f64_flat__noprivate( ; GCN-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] ; GCN: atomicrmw.start: @@ -203,7 +238,7 @@ define double @test_atomicrmw_fmin_f64_flat(ptr %ptr, double %value) { ; GCN: atomicrmw.end: ; GCN-NEXT: ret double [[TMP6]] ; - %res = atomicrmw fmin ptr %ptr, double %value seq_cst + %res = atomicrmw fmin ptr %ptr, double %value seq_cst, !noalias.addrspace !0 ret double %res } @@ -257,6 +292,9 @@ define double @test_atomicrmw_fmin_f64_global_strictfp(ptr addrspace(1) %ptr, do %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst ret double %res } + +!0 = !{i32 5, i32 6} + ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GFX7: {{.*}} ; GFX9: {{.*}} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll index bbcc6b8a2724f..4c22d830f7a1c 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll @@ -198,8 +198,8 @@ define half @test_atomicrmw_fsub_f16_local(ptr addrspace(3) %ptr, half %value) { ret half %res } -define double @test_atomicrmw_fsub_f64_flat(ptr %ptr, double %value) { -; GCN-LABEL: @test_atomicrmw_fsub_f64_flat( +define double @test_atomicrmw_fsub_f64_flat__noprivate(ptr %ptr, double %value) { +; GCN-LABEL: @test_atomicrmw_fsub_f64_flat__noprivate( ; GCN-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8 ; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] ; GCN: atomicrmw.start: @@ -214,6 +214,41 @@ define double @test_atomicrmw_fsub_f64_flat(ptr %ptr, double %value) { ; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GCN: atomicrmw.end: ; GCN-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fsub ptr %ptr, double %value seq_cst, !noalias.addrspace !0 + ret double %res +} + +define double @test_atomicrmw_fsub_f64_flat(ptr %ptr, double %value) { +; GCN-LABEL: @test_atomicrmw_fsub_f64_flat( +; GCN-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR:%.*]]) +; GCN-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; GCN: atomicrmw.private: +; GCN-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GCN-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GCN-NEXT: [[NEW:%.*]] = fsub double [[LOADED_PRIVATE]], [[VALUE:%.*]] +; GCN-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; GCN: atomicrmw.global: +; GCN-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 +; GCN-NEXT: br label [[ATOMICRMW_START:%.*]] +; GCN: atomicrmw.start: +; GCN-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], [[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GCN-NEXT: [[NEW2:%.*]] = fsub double [[LOADED]], [[VALUE]] +; GCN-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GCN-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GCN-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GCN-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GCN-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START]] +; GCN: atomicrmw.end1: +; GCN-NEXT: br label [[ATOMICRMW_PHI]] +; GCN: atomicrmw.phi: +; GCN-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], [[ATOMICRMW_END1]] ] +; GCN-NEXT: br label [[ATOMICRMW_END:%.*]] +; GCN: atomicrmw.end: +; GCN-NEXT: ret double [[RES]] ; %res = atomicrmw fsub ptr %ptr, double %value seq_cst ret double %res @@ -625,3 +660,5 @@ define bfloat @test_atomicrmw_fadd_bf16_flat_system_align4(ptr %ptr, bfloat %val %res = atomicrmw fadd ptr %ptr, bfloat %value monotonic, align 4 ret bfloat %res } + +!0 = !{i32 5, i32 6} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll index 
3de502874d323..cb51bcf935614 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll @@ -14,7 +14,21 @@ define i64 @test_flat_atomicrmw_add_0_i64_agent(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_add_0_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = add i64 [[LOADED_PRIVATE]], 0 +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0:![0-9]+]], !amdgpu.no.fine.grained.memory [[META1:![0-9]+]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw add ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -24,7 +38,21 @@ define i64 @test_flat_atomicrmw_add_0_i64_agent(ptr %ptr) { define i64 @test_flat_atomicrmw_sub_0_i64_agent(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_sub_0_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = sub i64 [[LOADED_PRIVATE]], 0 +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw sub ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -34,7 +62,21 @@ define i64 @test_flat_atomicrmw_sub_0_i64_agent(ptr %ptr) { define i64 @test_flat_atomicrmw_or_0_i64_agent(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_or_0_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw or ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, 
!amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = or i64 [[LOADED_PRIVATE]], 0 +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw or ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -44,7 +86,21 @@ define i64 @test_flat_atomicrmw_or_0_i64_agent(ptr %ptr) { define i64 @test_flat_atomicrmw_xor_0_i64_agent(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_xor_0_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xor ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = xor i64 [[LOADED_PRIVATE]], 0 +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw xor ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -58,7 +114,7 @@ define i64 @test_flat_atomicrmw_xor_0_i64_agent(ptr %ptr) { define i64 @test_flat_atomicrmw_add_0_i64_agent__noalias_addrspace_5(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_add_0_i64_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw add ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -68,7 +124,7 @@ define i64 @test_flat_atomicrmw_add_0_i64_agent__noalias_addrspace_5(ptr %ptr) { define i64 
@test_flat_atomicrmw_sub_0_i64_agent__noalias_addrspace_5(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_sub_0_i64_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw sub ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -78,7 +134,7 @@ define i64 @test_flat_atomicrmw_sub_0_i64_agent__noalias_addrspace_5(ptr %ptr) { define i64 @test_flat_atomicrmw_or_0_i64_agent__noalias_addrspace_5(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_or_0_i64_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw or ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw or ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw or ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -88,7 +144,7 @@ define i64 @test_flat_atomicrmw_or_0_i64_agent__noalias_addrspace_5(ptr %ptr) { define i64 @test_flat_atomicrmw_xor_0_i64_agent__noalias_addrspace_5(ptr %ptr) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_xor_0_i64_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xor ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw xor ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw xor ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -102,7 +158,21 @@ define i64 @test_flat_atomicrmw_xor_0_i64_agent__noalias_addrspace_5(ptr %ptr) { define i64 @test_flat_atomicrmw_add_i64_agent(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_add_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = add i64 [[LOADED_PRIVATE]], [[VALUE]] +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi 
i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw add ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -112,7 +182,7 @@ define i64 @test_flat_atomicrmw_add_i64_agent(ptr %ptr, i64 %value) { define i64 @test_flat_atomicrmw_add_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_add_i64_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw add ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -122,7 +192,7 @@ define i64 @test_flat_atomicrmw_add_i64_agent__noalias_addrspace_5(ptr %ptr, i64 define i32 @test_flat_atomicrmw_add_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { ; ALL-LABEL: define i32 @test_flat_atomicrmw_add_i32_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i32 [[RES]] ; %res = atomicrmw add ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -136,7 +206,20 @@ define i32 @test_flat_atomicrmw_add_i32_agent__noalias_addrspace_5(ptr %ptr, i32 define i64 @test_flat_atomicrmw_xchg_i64_agent(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_xchg_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: store i64 [[VALUE]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw xchg ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -146,7 +229,7 @@ define i64 
@test_flat_atomicrmw_xchg_i64_agent(ptr %ptr, i64 %value) { define i64 @test_flat_atomicrmw_xchg_i64_agent__noalias_xchgrspace_5(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_xchg_i64_agent__noalias_xchgrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw xchg ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -156,7 +239,7 @@ define i64 @test_flat_atomicrmw_xchg_i64_agent__noalias_xchgrspace_5(ptr %ptr, i define i32 @test_flat_atomicrmw_xchg_i32_agent__noalias_xchgrspace_5(ptr %ptr, i32 %value) { ; ALL-LABEL: define i32 @test_flat_atomicrmw_xchg_i32_agent__noalias_xchgrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i32 [[RES]] ; %res = atomicrmw xchg ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -170,7 +253,20 @@ define i32 @test_flat_atomicrmw_xchg_i32_agent__noalias_xchgrspace_5(ptr %ptr, i define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent(ptr %ptr, ptr addrspace(1) %value) { ; ALL-LABEL: define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent( ; ALL-SAME: ptr [[PTR:%.*]], ptr addrspace(1) [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(1) [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: store ptr addrspace(1) [[VALUE]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(1) [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi ptr addrspace(1) [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret ptr addrspace(1) [[RES]] ; %res = atomicrmw xchg ptr %ptr, ptr addrspace(1) %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -180,7 +276,7 @@ define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent(ptr %ptr, ptr addrspa define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent__noalias_xchgrspace_5(ptr %ptr, ptr addrspace(1) %value) { ; ALL-LABEL: 
define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent__noalias_xchgrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], ptr addrspace(1) [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(1) [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(1) [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret ptr addrspace(1) [[RES]] ; %res = atomicrmw xchg ptr %ptr, ptr addrspace(1) %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -190,7 +286,7 @@ define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent__noalias_xchgrspace_5 define ptr addrspace(3) @test_flat_atomicrmw_xchg_p3_agent__noalias_xchgrspace_5(ptr %ptr, ptr addrspace(3) %value) { ; ALL-LABEL: define ptr addrspace(3) @test_flat_atomicrmw_xchg_p3_agent__noalias_xchgrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], ptr addrspace(3) [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(3) [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(3) [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret ptr addrspace(3) [[RES]] ; %res = atomicrmw xchg ptr %ptr, ptr addrspace(3) %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -204,7 +300,21 @@ define ptr addrspace(3) @test_flat_atomicrmw_xchg_p3_agent__noalias_xchgrspace_5 define i64 @test_flat_atomicrmw_and_i64_agent(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: [[NEW:%.*]] = and i64 [[LOADED_PRIVATE]], [[VALUE]] +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP2:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] +; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -214,7 +324,7 @@ define i64 @test_flat_atomicrmw_and_i64_agent(ptr %ptr, i64 %value) { define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], i64 
[[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -224,7 +334,7 @@ define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5(ptr %ptr, i64 define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained( ; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]] ; ALL-NEXT: ret i64 [[RES]] ; %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1 @@ -235,7 +345,7 @@ define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_g define i32 @test_flat_atomicrmw_and_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { ; ALL-LABEL: define i32 @test_flat_atomicrmw_and_i32_agent__noalias_addrspace_5( ; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; ALL-NEXT: ret i32 [[RES]] ; %res = atomicrmw and ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -249,81 +359,165 @@ define i32 @test_flat_atomicrmw_and_i32_agent__noalias_addrspace_5(ptr %ptr, i32 define double @test_flat_atomicrmw_fadd_f64_agent(ptr %ptr, double %value) { ; GFX7-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( ; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX7-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX7-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX7-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX7: [[ATOMICRMW_PRIVATE]]: +; GFX7-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX7-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX7-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX7: [[ATOMICRMW_GLOBAL]]: +; GFX7-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 ; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX7: [[ATOMICRMW_START]]: -; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] -; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] -; GFX7-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX7-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX7-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX7-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END1]]: +; GFX7-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX7: [[ATOMICRMW_PHI]]: +; GFX7-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], %[[ATOMICRMW_END1]] ] +; GFX7-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX7: [[ATOMICRMW_END]]: -; GFX7-NEXT: ret double [[TMP5]] +; GFX7-NEXT: ret double [[RES]] ; ; GFX900-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( ; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX900-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX900: [[ATOMICRMW_PRIVATE]]: +; GFX900-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX900-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX900-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX900: [[ATOMICRMW_GLOBAL]]: +; GFX900-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 ; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX900: [[ATOMICRMW_START]]: -; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] -; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] -; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") 
seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX900-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END1]]: +; GFX900-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX900: [[ATOMICRMW_PHI]]: +; GFX900-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], %[[ATOMICRMW_END1]] ] +; GFX900-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX900: [[ATOMICRMW_END]]: -; GFX900-NEXT: ret double [[TMP5]] +; GFX900-NEXT: ret double [[RES]] ; ; GFX908-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( ; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX908-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX908: [[ATOMICRMW_PRIVATE]]: +; GFX908-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX908-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX908-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX908: [[ATOMICRMW_GLOBAL]]: +; GFX908-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 ; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX908: [[ATOMICRMW_START]]: -; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] -; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] -; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double -; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END1]]: +; GFX908-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX908: [[ATOMICRMW_PHI]]: +; GFX908-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], %[[ATOMICRMW_END1]] ] +; GFX908-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX908: [[ATOMICRMW_END]]: ; GFX908-NEXT: ret double [[RES]] ; ; GFX90A-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( ; GFX90A-SAME: ptr [[PTR:%.*]], double 
[[VALUE:%.*]]) #[[ATTR0]] { -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX90A: [[ATOMICRMW_PRIVATE]]: +; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX90A-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX90A: [[ATOMICRMW_GLOBAL]]: +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX90A: [[ATOMICRMW_PHI]]: +; GFX90A-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX90A: [[ATOMICRMW_END]]: ; GFX90A-NEXT: ret double [[RES]] ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX940-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX940: [[ATOMICRMW_PRIVATE]]: +; GFX940-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX940-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX940-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX940: [[ATOMICRMW_GLOBAL]]: +; GFX940-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX940-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX940: [[ATOMICRMW_PHI]]: +; GFX940-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ] +; GFX940-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX940: [[ATOMICRMW_END]]: ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( ; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX12-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX12: [[ATOMICRMW_PRIVATE]]: +; GFX12-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX12-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED_PRIVATE]], [[VALUE]] +; GFX12-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX12: [[ATOMICRMW_GLOBAL]]: +; GFX12-NEXT: [[TMP2:%.*]] = load double, ptr [[PTR]], align 8 ; GFX12-NEXT: br 
label %[[ATOMICRMW_START:.*]] ; GFX12: [[ATOMICRMW_START]]: -; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] -; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] -; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP2]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP6:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW2:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[NEW2]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END1]]: +; GFX12-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX12: [[ATOMICRMW_PHI]]: +; GFX12-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP6]], %[[ATOMICRMW_END1]] ] +; GFX12-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX12: [[ATOMICRMW_END]]: -; GFX12-NEXT: ret double [[TMP5]] +; GFX12-NEXT: ret double [[RES]] ; %res = atomicrmw fadd ptr %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %res @@ -383,12 +577,12 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5(ptr %ptr, ; ; GFX90A-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( ; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX90A-NEXT: ret double [[RES]] ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( @@ -483,7 +677,7 @@ define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fi ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( ; GFX940-SAME: ptr [[PTR:%.*]], double 
[[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( @@ -565,7 +759,7 @@ define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr, ; GFX90A-NEXT: br i1 [[IS_SHARED]], label %[[ATOMICRMW_SHARED:.*]], label %[[ATOMICRMW_CHECK_PRIVATE:.*]] ; GFX90A: [[ATOMICRMW_SHARED]]: ; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) -; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]] ; GFX90A-NEXT: br label %[[ATOMICRMW_PHI:.*]] ; GFX90A: [[ATOMICRMW_CHECK_PRIVATE]]: ; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) @@ -578,7 +772,7 @@ define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr, ; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] ; GFX90A: [[ATOMICRMW_GLOBAL]]: ; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) -; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]] ; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] ; GFX90A: [[ATOMICRMW_PHI]]: ; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], %[[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], %[[ATOMICRMW_GLOBAL]] ] @@ -588,12 +782,12 @@ define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr, ; ; GFX940-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( ; GFX940-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]] ; GFX940-NEXT: ret float [[RES]] ; ; GFX12-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( ; GFX12-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VALUE]] 
syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]] ; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fadd ptr %ptr, float %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 @@ -671,12 +865,12 @@ define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5(ptr ; ; GFX940-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( ; GFX940-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX940-NEXT: ret <2 x half> [[RES]] ; ; GFX12-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( ; GFX12-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -754,12 +948,12 @@ define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( ; ; GFX940-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( ; GFX940-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX940-NEXT: ret <2 x bfloat> [[RES]] ; ; GFX12-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( ; GFX12-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -773,69 +967,153 @@ define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( define double @test_flat_atomicrmw_fmin_f64_agent(ptr %ptr, double %value) { ; GFX7-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( ; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] 
syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX7-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX7: [[ATOMICRMW_PRIVATE]]: +; GFX7-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX7-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX7-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX7-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX7: [[ATOMICRMW_GLOBAL]]: +; GFX7-NEXT: [[TMP3:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX7-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX7: [[ATOMICRMW_PHI]]: +; GFX7-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ] +; GFX7-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX7: [[ATOMICRMW_END]]: ; GFX7-NEXT: ret double [[RES]] ; ; GFX900-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( ; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX900-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX900: [[ATOMICRMW_PRIVATE]]: +; GFX900-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX900-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX900-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX900-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX900: [[ATOMICRMW_GLOBAL]]: +; GFX900-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 ; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX900: [[ATOMICRMW_START]]: -; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] -; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) -; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[TMP4:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64 +; GFX900-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GFX900-NEXT: 
[[TMP8]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END1]]: +; GFX900-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX900: [[ATOMICRMW_PHI]]: +; GFX900-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], %[[ATOMICRMW_END1]] ] +; GFX900-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX900: [[ATOMICRMW_END]]: -; GFX900-NEXT: ret double [[TMP5]] +; GFX900-NEXT: ret double [[RES]] ; ; GFX908-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( ; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX908-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX908: [[ATOMICRMW_PRIVATE]]: +; GFX908-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX908-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX908-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX908: [[ATOMICRMW_GLOBAL]]: +; GFX908-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 ; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX908: [[ATOMICRMW_START]]: -; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] -; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) -; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double -; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP4:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64 +; GFX908-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GFX908-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END1]]: +; GFX908-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX908: [[ATOMICRMW_PHI]]: +; GFX908-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], %[[ATOMICRMW_END1]] ] +; GFX908-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX908: [[ATOMICRMW_END]]: ; GFX908-NEXT: ret double [[RES]] ; ; GFX90A-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( ; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], 
double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX90A: [[ATOMICRMW_PRIVATE]]: +; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX90A-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX90A: [[ATOMICRMW_GLOBAL]]: +; GFX90A-NEXT: [[TMP3:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX90A: [[ATOMICRMW_PHI]]: +; GFX90A-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX90A: [[ATOMICRMW_END]]: ; GFX90A-NEXT: ret double [[RES]] ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX940-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX940: [[ATOMICRMW_PRIVATE]]: +; GFX940-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX940-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX940-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX940: [[ATOMICRMW_GLOBAL]]: +; GFX940-NEXT: [[TMP3:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX940-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX940: [[ATOMICRMW_PHI]]: +; GFX940-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ] +; GFX940-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX940: [[ATOMICRMW_END]]: ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( ; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX12-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX12: [[ATOMICRMW_PRIVATE]]: +; GFX12-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX12-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX12-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX12: [[ATOMICRMW_GLOBAL]]: +; GFX12-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 
; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX12: [[ATOMICRMW_START]]: -; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] -; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) -; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP4:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64 +; GFX12-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GFX12-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END1]]: +; GFX12-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX12: [[ATOMICRMW_PHI]]: +; GFX12-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], %[[ATOMICRMW_END1]] ] +; GFX12-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX12: [[ATOMICRMW_END]]: -; GFX12-NEXT: ret double [[TMP5]] +; GFX12-NEXT: ret double [[RES]] ; %res = atomicrmw fmin ptr %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %res @@ -844,7 +1122,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent(ptr %ptr, double %value) { define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5(ptr %ptr, double %value) { ; GFX7-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( ; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX7-NEXT: ret double [[RES]] ; ; GFX900-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( @@ -883,12 +1161,12 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5(ptr %ptr, ; ; GFX90A-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( ; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX90A-NEXT: ret double 
[[RES]] ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( @@ -983,7 +1261,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fi ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( @@ -1010,7 +1288,7 @@ define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fi define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr, float %value) { ; GFX7-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( ; GFX7-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX7-NEXT: ret float [[RES]] ; ; GFX900-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( @@ -1083,7 +1361,7 @@ define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr, ; ; GFX12-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( ; GFX12-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmin ptr %ptr, float %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -1097,69 +1375,153 @@ define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr, define double @test_flat_atomicrmw_fmax_f64_agent(ptr %ptr, double %value) { ; GFX7-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( ; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX7-NEXT: br i1 [[IS_PRIVATE]], label 
%[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX7: [[ATOMICRMW_PRIVATE]]: +; GFX7-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX7-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX7-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX7-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX7: [[ATOMICRMW_GLOBAL]]: +; GFX7-NEXT: [[TMP3:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX7-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX7: [[ATOMICRMW_PHI]]: +; GFX7-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ] +; GFX7-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX7: [[ATOMICRMW_END]]: ; GFX7-NEXT: ret double [[RES]] ; ; GFX900-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( ; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX900-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX900: [[ATOMICRMW_PRIVATE]]: +; GFX900-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX900-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX900-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX900-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX900: [[ATOMICRMW_GLOBAL]]: +; GFX900-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 ; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX900: [[ATOMICRMW_START]]: -; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] -; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) -; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[TMP4:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64 +; GFX900-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GFX900-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END1]]: +; GFX900-NEXT: br label 
%[[ATOMICRMW_PHI]] +; GFX900: [[ATOMICRMW_PHI]]: +; GFX900-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], %[[ATOMICRMW_END1]] ] +; GFX900-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX900: [[ATOMICRMW_END]]: -; GFX900-NEXT: ret double [[TMP5]] +; GFX900-NEXT: ret double [[RES]] ; ; GFX908-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( ; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX908-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX908: [[ATOMICRMW_PRIVATE]]: +; GFX908-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX908-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX908-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX908: [[ATOMICRMW_GLOBAL]]: +; GFX908-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 ; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX908: [[ATOMICRMW_START]]: -; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] -; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) -; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double -; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[TMP4:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64 +; GFX908-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GFX908-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END1]]: +; GFX908-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX908: [[ATOMICRMW_PHI]]: +; GFX908-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], %[[ATOMICRMW_END1]] ] +; GFX908-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX908: [[ATOMICRMW_END]]: ; GFX908-NEXT: ret double [[RES]] ; ; GFX90A-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( ; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX90A-NEXT: br i1 
[[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX90A: [[ATOMICRMW_PRIVATE]]: +; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX90A-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX90A: [[ATOMICRMW_GLOBAL]]: +; GFX90A-NEXT: [[TMP3:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX90A: [[ATOMICRMW_PHI]]: +; GFX90A-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX90A: [[ATOMICRMW_END]]: ; GFX90A-NEXT: ret double [[RES]] ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX940-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX940: [[ATOMICRMW_PRIVATE]]: +; GFX940-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX940-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX940-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX940-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX940: [[ATOMICRMW_GLOBAL]]: +; GFX940-NEXT: [[TMP3:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] +; GFX940-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX940: [[ATOMICRMW_PHI]]: +; GFX940-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ] +; GFX940-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX940: [[ATOMICRMW_END]]: ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( ; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX12-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX12: [[ATOMICRMW_PRIVATE]]: +; GFX12-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX12-NEXT: [[LOADED_PRIVATE:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8 +; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED_PRIVATE]], double [[VALUE]]) +; GFX12-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP1]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX12: [[ATOMICRMW_GLOBAL]]: +; GFX12-NEXT: [[TMP3:%.*]] = load double, ptr [[PTR]], align 8 ; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] ; GFX12: [[ATOMICRMW_START]]: -; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] -; 
GFX12-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) -; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 -; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 -; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 -; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 -; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 -; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double -; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ], [ [[TMP8:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[TMP4:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to i64 +; GFX12-NEXT: [[TMP6:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP7:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP6]], i64 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP7]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP7]], 0 +; GFX12-NEXT: [[TMP8]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END1]]: +; GFX12-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX12: [[ATOMICRMW_PHI]]: +; GFX12-NEXT: [[RES:%.*]] = phi double [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP8]], %[[ATOMICRMW_END1]] ] +; GFX12-NEXT: br label %[[ATOMICRMW_END:.*]] ; GFX12: [[ATOMICRMW_END]]: -; GFX12-NEXT: ret double [[TMP5]] +; GFX12-NEXT: ret double [[RES]] ; %res = atomicrmw fmax ptr %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %res @@ -1168,7 +1530,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent(ptr %ptr, double %value) { define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5(ptr %ptr, double %value) { ; GFX7-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( ; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX7-NEXT: ret double [[RES]] ; ; GFX900-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( @@ -1207,12 +1569,12 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5(ptr %ptr, ; ; GFX90A-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( ; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX90A-NEXT: ret double [[RES]] ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: 
[[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( @@ -1307,7 +1669,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fi ; ; GFX940-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( ; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META0]] ; GFX940-NEXT: ret double [[RES]] ; ; GFX12-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( @@ -1334,7 +1696,7 @@ define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fi define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr, float %value) { ; GFX7-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( ; GFX7-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX7-NEXT: ret float [[RES]] ; ; GFX900-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( @@ -1407,7 +1769,7 @@ define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr, ; ; GFX12-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( ; GFX12-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { -; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META0]], !amdgpu.no.fine.grained.memory [[META1]] ; GFX12-NEXT: ret float [[RES]] ; %res = atomicrmw fmax ptr %ptr, float %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 @@ -1421,16 +1783,31 @@ define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr, define i64 @test_flat_atomicrmw_nand_i64_agent(ptr %ptr, i64 %value) { ; ALL-LABEL: define i64 @test_flat_atomicrmw_nand_i64_agent( ; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; ALL-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 -; ALL-NEXT: br label %[[ATOMICRMW_START:.*]] -; ALL: [[ATOMICRMW_START]]: -; ALL-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; ALL-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; ALL-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; ALL: [[ATOMICRMW_PRIVATE]]: +; ALL-NEXT: 
[[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; ALL-NEXT: [[LOADED:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8 ; ALL-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] ; ALL-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 -; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 -; ALL-NEXT: [[RES]] = extractvalue { i64, i1 } [[TMP3]], 0 -; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; ALL-NEXT: store i64 [[NEW]], ptr addrspace(5) [[TMP1]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; ALL: [[ATOMICRMW_GLOBAL]]: +; ALL-NEXT: [[TMP3:%.*]] = load i64, ptr [[PTR]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_START:.*]] +; ALL: [[ATOMICRMW_START]]: +; ALL-NEXT: [[LOADED1:%.*]] = phi i64 [ [[TMP3]], %[[ATOMICRMW_GLOBAL]] ], [ [[NEWLOADED:%.*]], %[[ATOMICRMW_START]] ] +; ALL-NEXT: [[TMP4:%.*]] = and i64 [[LOADED1]], [[VALUE]] +; ALL-NEXT: [[NEW2:%.*]] = xor i64 [[TMP4]], -1 +; ALL-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED1]], i64 [[NEW2]] syncscope("agent") seq_cst seq_cst, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; ALL-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP5]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END1:.*]], label %[[ATOMICRMW_START]] +; ALL: [[ATOMICRMW_END1]]: +; ALL-NEXT: br label %[[ATOMICRMW_PHI]] +; ALL: [[ATOMICRMW_PHI]]: +; ALL-NEXT: [[RES:%.*]] = phi i64 [ [[LOADED]], %[[ATOMICRMW_PRIVATE]] ], [ [[NEWLOADED]], %[[ATOMICRMW_END1]] ] +; ALL-NEXT: br label %[[ATOMICRMW_END:.*]] ; ALL: [[ATOMICRMW_END]]: ; ALL-NEXT: ret i64 [[RES]] ; @@ -1503,21 +1880,21 @@ define i32 @test_flat_atomicrmw_nand_i32_agent__noalias_addrspace_5(ptr %ptr, i3 !1 = !{i32 5, i32 6} ;. -; GFX7: [[META0]] = !{} -; GFX7: [[META1]] = !{i32 5, i32 6} +; GFX7: [[META0]] = !{i32 5, i32 6} +; GFX7: [[META1]] = !{} ;. -; GFX900: [[META0]] = !{} -; GFX900: [[META1]] = !{i32 5, i32 6} +; GFX900: [[META0]] = !{i32 5, i32 6} +; GFX900: [[META1]] = !{} ;. -; GFX908: [[META0]] = !{} -; GFX908: [[META1]] = !{i32 5, i32 6} +; GFX908: [[META0]] = !{i32 5, i32 6} +; GFX908: [[META1]] = !{} ;. -; GFX90A: [[META0]] = !{} -; GFX90A: [[META1]] = !{i32 5, i32 6} +; GFX90A: [[META0]] = !{i32 5, i32 6} +; GFX90A: [[META1]] = !{} ;. -; GFX940: [[META0]] = !{} -; GFX940: [[META1]] = !{i32 5, i32 6} +; GFX940: [[META0]] = !{i32 5, i32 6} +; GFX940: [[META1]] = !{} ;. -; GFX12: [[META0]] = !{} -; GFX12: [[META1]] = !{i32 5, i32 6} +; GFX12: [[META0]] = !{i32 5, i32 6} +; GFX12: [[META1]] = !{} ;. 
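Taken together, the GFX90A/GFX940 checks above all reduce to the same expansion shape for a 64-bit flat atomicrmw that may dynamically address private memory: an llvm.amdgcn.is.private guard, a non-atomic load/op/store on the private path, the original flat atomic (now annotated with !noalias.addrspace) on the global path, and a phi merging the two results. Below is a hand-reduced sketch of that output, consolidated from the checked IR rather than copied from any single RUN line; the function name is illustrative.

declare i1 @llvm.amdgcn.is.private(ptr nocapture)
declare double @llvm.maxnum.f64(double, double)

define double @expanded_fmax_sketch(ptr %ptr, double %value) {
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %ptr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:                                ; non-atomic scalar path
  %p5 = addrspacecast ptr %ptr to ptr addrspace(5)
  %loaded.private = load double, ptr addrspace(5) %p5, align 8
  %new = call double @llvm.maxnum.f64(double %loaded.private, double %value)
  store double %new, ptr addrspace(5) %p5, align 8
  br label %atomicrmw.phi

atomicrmw.global:                                 ; flat atomic; private is now provably excluded
  %loaded.global = atomicrmw fmax ptr %ptr, double %value syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.fine.grained.memory !1
  br label %atomicrmw.phi

atomicrmw.phi:
  %res = phi double [ %loaded.private, %atomicrmw.private ], [ %loaded.global, %atomicrmw.global ]
  ret double %res
}

!0 = !{i32 5, i32 6}
!1 = !{}

The actual expansion also emits a trailing atomicrmw.end block, and targets without a native f64 fmax (GFX900/GFX908/GFX12 above) additionally cmpxchg-expand the atomic on the global path.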
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll index 8e5b7806a5904..80058b3cef4ea 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll @@ -35,8 +35,22 @@ define i32 @test_atomicrmw_or_0_global_one_as(ptr addrspace(1) %ptr) { define i32 @test_atomicrmw_or_0_flat_system(ptr %ptr) { ; CHECK-LABEL: define i32 @test_atomicrmw_or_0_flat_system( ; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i32 0 seq_cst, align 4 -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; CHECK-NEXT: br i1 [[IS_PRIVATE]], label [[ATOMICRMW_PRIVATE:%.*]], label [[ATOMICRMW_GLOBAL:%.*]] +; CHECK: atomicrmw.private: +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; CHECK-NEXT: [[LOADED_PRIVATE:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: [[NEW:%.*]] = or i32 [[LOADED_PRIVATE]], 0 +; CHECK-NEXT: store i32 [[NEW]], ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_PHI:%.*]] +; CHECK: atomicrmw.global: +; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i32 0 seq_cst, align 4, !noalias.addrspace [[META1:![0-9]+]] +; CHECK-NEXT: br label [[ATOMICRMW_PHI]] +; CHECK: atomicrmw.phi: +; CHECK-NEXT: [[RES1:%.*]] = phi i32 [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[RES]], [[ATOMICRMW_GLOBAL]] ] +; CHECK-NEXT: br label [[ATOMICRMW_END:%.*]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: ret i32 [[RES1]] ; %res = atomicrmw or ptr %ptr, i32 0 seq_cst ret i32 %res diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index a0856ac9127e6..e2dada85ef872 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -24,7 +24,7 @@ entry: %i = add nsw i32 %a, -1 %i.2 = sext i32 %i to i64 %i.3 = getelementptr inbounds double, ptr %b, i64 %i.2 - %i.4 = atomicrmw fadd ptr %i.3, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + %i.4 = atomicrmw fadd ptr %i.3, double %c syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -59,7 +59,7 @@ entry: %i.2 = sext i32 %i to i64 %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2 %i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr - %0 = atomicrmw fadd ptr %i.4, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + %0 = atomicrmw fadd ptr %i.4, double %c syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -107,9 +107,9 @@ bb1: ; preds = %entry %i.7 = ptrtoint ptr addrspace(1) %i.3 to i64 %i.8 = add nsw i64 %i.7, 1 %i.9 = inttoptr i64 %i.8 to ptr addrspace(1) - %0 = atomicrmw fadd ptr %d, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + %0 = atomicrmw fadd ptr %d, double %c syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 %i.11 = addrspacecast ptr addrspace(1) %i.9 to ptr - %1 = atomicrmw fadd ptr %i.11, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + %1 = atomicrmw fadd ptr %i.11, double %c 
syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -175,3 +175,4 @@ attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite attributes #1 = { mustprogress nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" } !0 = !{} +!1 = !{i32 5, i32 6}