diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7ce1359f03da6..760c7087f677a 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -338,8 +338,8 @@ class WaitcntBrackets { const MachineOperand &Op) const; bool counterOutOfOrder(InstCounterType T) const; - void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; - void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + void simplifyWaitcnt(AMDGPU::Waitcnt &Wait, bool OptNone) const; + void simplifyWaitcnt(InstCounterType T, unsigned &Count, bool OptNone) const; void determineWait(InstCounterType T, RegInterval Interval, AMDGPU::Waitcnt &Wait) const; @@ -1164,22 +1164,33 @@ void WaitcntBrackets::print(raw_ostream &OS) const { /// Simplify the waitcnt, in the sense of removing redundant counts, and return /// whether a waitcnt instruction is needed at all. -void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { - simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt); - simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); - simplifyWaitcnt(DS_CNT, Wait.DsCnt); - simplifyWaitcnt(STORE_CNT, Wait.StoreCnt); - simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); - simplifyWaitcnt(BVH_CNT, Wait.BvhCnt); - simplifyWaitcnt(KM_CNT, Wait.KmCnt); - simplifyWaitcnt(X_CNT, Wait.XCnt); +void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait, + bool OptNone) const { + simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt, OptNone); + simplifyWaitcnt(EXP_CNT, Wait.ExpCnt, OptNone); + simplifyWaitcnt(DS_CNT, Wait.DsCnt, OptNone); + simplifyWaitcnt(STORE_CNT, Wait.StoreCnt, OptNone); + simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt, OptNone); + simplifyWaitcnt(BVH_CNT, Wait.BvhCnt, OptNone); + simplifyWaitcnt(KM_CNT, Wait.KmCnt, OptNone); + simplifyWaitcnt(X_CNT, Wait.XCnt, OptNone); } -void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, - unsigned &Count) const { +void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, unsigned &Count, + bool OptNone) const { // The number of outstanding events for this type, T, can be calculated // as (UB - LB). If the current Count is greater than or equal to the number // of outstanding events, then the wait for this counter is redundant. + // + // For counts that are at max value or above, try this even when optimizations + // are disabled. This helps remove max waitcnt's that are inserted by the + // memory legalizer by default, but does not optimize actual waitcnt's that + // are otherwise inserted by the memory legalizer or a previous pass of the + // inserter. The corner case is when a max waitcnt was optimized away although + // it was not just a default, but was deliberately chosen. This only + // marginally affects the usefulness of OptNone. + if (Count < getWaitCountMax(T) && OptNone) + return; if (Count >= getScoreRange(T)) Count = ~0u; } @@ -1363,19 +1374,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( } unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode()); - bool TrySimplify = Opcode != II.getOpcode() && !OptNone; + bool OpcodeIsSoft = Opcode != II.getOpcode(); // Update required wait count. If this is a soft waitcnt (= it was added // by an earlier pass), it may be entirely removed. if (Opcode == AMDGPU::S_WAITCNT) { unsigned IEnc = II.getOperand(0).getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); - if (TrySimplify) - ScoreBrackets.simplifyWaitcnt(OldWait); + if (OpcodeIsSoft) + ScoreBrackets.simplifyWaitcnt(OldWait, OptNone); Wait = Wait.combined(OldWait); // Merge consecutive waitcnt of the same type by erasing multiples. - if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) { + if (WaitcntInstr || + (!Wait.hasWaitExceptStoreCnt() && OpcodeIsSoft && !OptNone)) { II.eraseFromParent(); Modified = true; } else @@ -1386,11 +1398,13 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( unsigned OldVSCnt = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); - if (TrySimplify) - ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt); + if (OpcodeIsSoft) + ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt, + OptNone); Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt); - if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) { + if (WaitcntVsCntInstr || + (!Wait.hasWaitStoreCnt() && OpcodeIsSoft && !OptNone)) { II.eraseFromParent(); Modified = true; } else @@ -1528,7 +1542,7 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( // by an earlier pass), it may be entirely removed. unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode()); - bool TrySimplify = Opcode != II.getOpcode() && !OptNone; + bool OpcodeIsSoft = Opcode != II.getOpcode(); // Don't crash if the programmer used legacy waitcnt intrinsics, but don't // attempt to do more than that either. @@ -1539,16 +1553,16 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( unsigned OldEnc = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc); - if (TrySimplify) - ScoreBrackets.simplifyWaitcnt(OldWait); + if (OpcodeIsSoft) + ScoreBrackets.simplifyWaitcnt(OldWait, OptNone); Wait = Wait.combined(OldWait); UpdatableInstr = &CombinedLoadDsCntInstr; } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) { unsigned OldEnc = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc); - if (TrySimplify) - ScoreBrackets.simplifyWaitcnt(OldWait); + if (OpcodeIsSoft) + ScoreBrackets.simplifyWaitcnt(OldWait, OptNone); Wait = Wait.combined(OldWait); UpdatableInstr = &CombinedStoreDsCntInstr; } else { @@ -1556,8 +1570,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( assert(CT.has_value()); unsigned OldCnt = TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm(); - if (TrySimplify) - ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt); + if (OpcodeIsSoft) + ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt, OptNone); addWait(Wait, CT.value(), OldCnt); UpdatableInstr = &WaitInstrs[CT.value()]; } @@ -2009,7 +2023,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, } // Verify that the wait is actually needed. - ScoreBrackets.simplifyWaitcnt(Wait); + ScoreBrackets.simplifyWaitcnt(Wait, /* OptNone = */ false); // When forcing emit, we need to skip terminators because that would break the // terminators of the MBB if we emit a waitcnt between terminators. @@ -2238,7 +2252,7 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst, NeedsEndPGMCheck = true; } - ScoreBrackets.simplifyWaitcnt(Wait); + ScoreBrackets.simplifyWaitcnt(Wait, /* OptNone = */ false); auto SuccessorIt = std::next(Inst.getIterator()); bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets, diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 3212060f303a5..cc04d257db323 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -1149,7 +1149,11 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, } } - if (VMCnt || LGKMCnt) { + // Always emit a soft wait count at a release, even if it is trivially ~0. + // SIInsertWaitcnts will later add additional waits such as those required + // from direct load to LDS (formerly known as LDS DMA). + if (VMCnt || LGKMCnt || + (isReleaseOrStronger(Order) && Scope >= SIAtomicScope::WORKGROUP)) { unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(IV, VMCnt ? 0 : getVmcntBitMask(IV), @@ -2057,7 +2061,11 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, } } - if (VMCnt || LGKMCnt) { + // Always emit a soft wait count at a release, even if it is trivially ~0. + // SIInsertWaitcnts will later add additional waits such as those required + // from direct load to LDS (formerly known as LDS DMA). + if (VMCnt || LGKMCnt || + (isReleaseOrStronger(Order) && Scope >= SIAtomicScope::WORKGROUP)) { unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(IV, VMCnt ? 0 : getVmcntBitMask(IV), @@ -2373,6 +2381,13 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, } BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0); Changed = true; + } else if (isReleaseOrStronger(Order) && Scope >= SIAtomicScope::WORKGROUP) { + // Always emit a soft wait count at a release, even if it is trivially ~0. + // SIInsertWaitcnts will later add additional waits such as those required + // from direct load to LDS (formerly known as LDS DMA). + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)) + .addImm(getLoadcntBitMask(IV)); + Changed = true; } if (STORECnt) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index 66037615f0ba0..c5f4891f13ab8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -536,10 +536,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_release() #0 { ; GFX6-LABEL: name: workgroup_one_as_release ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_release ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_release @@ -550,6 +552,7 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 { ; ; GFX10CU-LABEL: name: workgroup_one_as_release ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_release @@ -560,6 +563,7 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 { ; ; GFX11CU-LABEL: name: workgroup_one_as_release ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") release @@ -569,10 +573,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { ; GFX6-LABEL: name: workgroup_one_as_acq_rel ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_acq_rel ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel @@ -584,6 +590,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { ; ; GFX10CU-LABEL: name: workgroup_one_as_acq_rel ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel @@ -595,6 +602,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { ; ; GFX11CU-LABEL: name: workgroup_one_as_acq_rel ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") acq_rel @@ -604,10 +612,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; GFX6-LABEL: name: workgroup_one_as_seq_cst ; GFX6: bb.0.entry: + ; GFX6-NEXT: S_WAITCNT_soft 3967 ; GFX6-NEXT: S_ENDPGM 0 ; ; GFX8-LABEL: name: workgroup_one_as_seq_cst ; GFX8: bb.0.entry: + ; GFX8-NEXT: S_WAITCNT_soft 3967 ; GFX8-NEXT: S_ENDPGM 0 ; ; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst @@ -619,6 +629,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; ; GFX10CU-LABEL: name: workgroup_one_as_seq_cst ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 65407 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst @@ -630,6 +641,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; ; GFX11CU-LABEL: name: workgroup_one_as_seq_cst ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 65527 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll index 1379eb61e0853..b9d4b0a3ef5ba 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll @@ -86,10 +86,12 @@ entry: define amdgpu_kernel void @workgroup_release_fence() { ; GFX6-LABEL: workgroup_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_release_fence: @@ -100,14 +102,17 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX10-CU-LABEL: workgroup_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_release_fence: @@ -117,6 +122,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_release_fence: @@ -132,6 +138,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX11-CU-LABEL: workgroup_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_release_fence: @@ -153,10 +160,12 @@ entry: define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX6-LABEL: workgroup_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_acq_rel_fence: @@ -168,14 +177,17 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX10-CU-LABEL: workgroup_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence: @@ -186,6 +198,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_acq_rel_fence: @@ -203,6 +216,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX11-CU-LABEL: workgroup_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_acq_rel_fence: @@ -225,10 +239,12 @@ entry: define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX6-LABEL: workgroup_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_seq_cst_fence: @@ -240,14 +256,17 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX10-CU-LABEL: workgroup_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_seq_cst_fence: @@ -258,6 +277,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_seq_cst_fence: @@ -275,6 +295,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX11-CU-LABEL: workgroup_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_seq_cst_fence: @@ -367,10 +388,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX6-LABEL: workgroup_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_release_fence: @@ -381,14 +404,17 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -398,6 +424,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -413,6 +440,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_release_fence: @@ -434,10 +462,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX6-LABEL: workgroup_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -449,14 +479,17 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -467,6 +500,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -484,6 +518,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -506,10 +541,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX6-LABEL: workgroup_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_seq_cst_fence: @@ -521,14 +558,17 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: @@ -539,6 +579,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: @@ -556,6 +597,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll index 971015b391ca8..6d9777479be61 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll @@ -114,6 +114,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX90A-TGSPLIT-LABEL: workgroup_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_release_fence: @@ -123,6 +124,7 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX942-TGSPLIT-LABEL: workgroup_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: workgroup_release_fence: @@ -180,6 +182,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: @@ -189,6 +192,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX942-TGSPLIT-LABEL: workgroup_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: workgroup_acq_rel_fence: @@ -246,6 +250,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX90A-TGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: @@ -255,6 +260,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX942-TGSPLIT-LABEL: workgroup_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: workgroup_seq_cst_fence: @@ -339,46 +345,57 @@ entry: define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX6-LABEL: workgroup_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_release_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: workgroup_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: workgroup_one_as_release_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: workgroup_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_release_fence: @@ -396,46 +413,57 @@ entry: define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX6-LABEL: workgroup_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acq_rel_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: workgroup_one_as_acq_rel_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -453,46 +481,57 @@ entry: define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX6-LABEL: workgroup_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_seq_cst_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: workgroup_one_as_seq_cst_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence: @@ -608,6 +647,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX90A-TGSPLIT-LABEL: agent_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: agent_release_fence: @@ -617,6 +657,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX942-TGSPLIT-LABEL: agent_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: agent_release_fence: @@ -674,6 +715,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX90A-TGSPLIT-LABEL: agent_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: agent_acq_rel_fence: @@ -683,6 +725,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX942-TGSPLIT-LABEL: agent_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: agent_acq_rel_fence: @@ -740,6 +783,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX90A-TGSPLIT-LABEL: agent_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: agent_seq_cst_fence: @@ -749,6 +793,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX942-TGSPLIT-LABEL: agent_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: agent_seq_cst_fence: @@ -833,46 +878,57 @@ entry: define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX6-LABEL: agent_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_release_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: agent_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: agent_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: agent_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_one_as_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: agent_one_as_release_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: agent_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: agent_one_as_release_fence: @@ -890,46 +946,57 @@ entry: define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX6-LABEL: agent_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_acq_rel_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: agent_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: agent_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_one_as_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: agent_one_as_acq_rel_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: agent_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: agent_one_as_acq_rel_fence: @@ -947,46 +1014,57 @@ entry: define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX6-LABEL: agent_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: agent_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: agent_one_as_seq_cst_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: agent_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: agent_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: agent_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: agent_one_as_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: agent_one_as_seq_cst_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: agent_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: agent_one_as_seq_cst_fence: @@ -1102,6 +1180,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX90A-TGSPLIT-LABEL: system_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: system_release_fence: @@ -1111,6 +1190,7 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX942-TGSPLIT-LABEL: system_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: system_release_fence: @@ -1168,6 +1248,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX90A-TGSPLIT-LABEL: system_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: system_acq_rel_fence: @@ -1177,6 +1258,7 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX942-TGSPLIT-LABEL: system_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: system_acq_rel_fence: @@ -1234,6 +1316,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX90A-TGSPLIT-LABEL: system_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: system_seq_cst_fence: @@ -1243,6 +1326,7 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX942-TGSPLIT-LABEL: system_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: system_seq_cst_fence: @@ -1327,46 +1411,57 @@ entry: define amdgpu_kernel void @system_one_as_release_fence() { ; GFX6-LABEL: system_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_release_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: system_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: system_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: system_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_one_as_release_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: system_one_as_release_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: system_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: system_one_as_release_fence: @@ -1384,46 +1479,57 @@ entry: define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX6-LABEL: system_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_acq_rel_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: system_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: system_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: system_one_as_acq_rel_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: system_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: system_one_as_acq_rel_fence: @@ -1441,46 +1547,57 @@ entry: define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX6-LABEL: system_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: system_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: system_one_as_seq_cst_fence: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: system_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: system_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX942-TGSPLIT: ; %bb.0: ; %entry +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: system_one_as_seq_cst_fence: ; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: system_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: system_one_as_seq_cst_fence: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index 0e459ed0f1243..688798027eef7 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -1311,10 +1311,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX6-LABEL: workgroup_one_as_release_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_release_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_release_fence: @@ -1325,14 +1327,17 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -1342,6 +1347,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_release_fence: @@ -1357,6 +1363,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_release_fence: @@ -1378,10 +1385,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX6-LABEL: workgroup_one_as_acq_rel_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_acq_rel_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -1393,14 +1402,17 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -1411,6 +1423,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: @@ -1428,6 +1441,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -1450,10 +1464,12 @@ entry: define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX6-LABEL: workgroup_one_as_seq_cst_fence: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: workgroup_one_as_seq_cst_fence: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: workgroup_one_as_seq_cst_fence: @@ -1465,14 +1481,17 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: @@ -1483,6 +1502,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX942-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: @@ -1500,6 +1520,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 07ad8cb0c4a3d..66162608da39f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -14008,6 +14008,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -14028,6 +14029,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -14115,6 +14117,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -14131,6 +14134,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -14208,6 +14212,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -14228,6 +14233,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -14315,6 +14321,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -14331,6 +14338,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15914,6 +15922,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15948,6 +15957,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16069,6 +16079,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16089,6 +16100,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16203,6 +16215,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16237,6 +16250,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16358,6 +16372,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16378,6 +16393,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17022,6 +17038,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17056,6 +17073,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17177,6 +17195,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17197,6 +17216,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17311,6 +17331,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17345,6 +17366,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17466,6 +17488,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17486,6 +17509,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17600,6 +17624,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17634,6 +17659,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17755,6 +17781,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17775,6 +17802,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -18467,6 +18495,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -18501,6 +18530,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -18622,6 +18652,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -18642,6 +18673,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -18756,6 +18788,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -18790,6 +18823,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -18911,6 +18945,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -18931,6 +18966,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -19045,6 +19081,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -19079,6 +19116,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -19200,6 +19238,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -19220,6 +19259,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 919fc3e8f4e4f..9ac088f7133bc 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -14210,6 +14210,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -14230,6 +14231,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -14321,6 +14323,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -14337,6 +14340,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -14416,6 +14420,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -14436,6 +14441,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -14527,6 +14533,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -14543,6 +14550,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16148,6 +16156,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16182,6 +16191,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16307,6 +16317,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16327,6 +16338,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16443,6 +16455,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16477,6 +16490,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16602,6 +16616,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16622,6 +16637,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17272,6 +17288,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17306,6 +17323,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17431,6 +17449,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17451,6 +17470,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17567,6 +17587,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17601,6 +17622,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17726,6 +17748,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17746,6 +17769,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17862,6 +17886,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17896,6 +17921,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -18021,6 +18047,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -18041,6 +18068,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -18747,6 +18775,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -18781,6 +18810,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -18906,6 +18936,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -18926,6 +18957,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -19042,6 +19074,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -19076,6 +19109,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -19201,6 +19235,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -19221,6 +19256,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -19337,6 +19373,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -19371,6 +19408,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -19496,6 +19534,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -19516,6 +19555,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 0fd4aa4a7a93f..d992b0d1d63f5 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -11018,7 +11018,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11058,7 +11060,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11072,7 +11076,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11087,7 +11093,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11115,7 +11123,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11160,7 +11170,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -11519,6 +11531,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11551,6 +11564,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -11562,6 +11576,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11574,6 +11589,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11597,6 +11613,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11632,6 +11649,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -11678,6 +11696,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11710,6 +11729,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -11721,6 +11741,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11733,6 +11754,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11756,6 +11778,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11791,6 +11814,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -12145,6 +12169,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12177,6 +12202,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -12188,6 +12214,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12200,6 +12227,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12223,6 +12251,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12258,6 +12287,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -12304,7 +12334,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12322,6 +12354,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -12338,7 +12371,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12349,7 +12384,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12361,7 +12398,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12386,7 +12425,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12413,6 +12454,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -12425,7 +12467,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -12473,7 +12517,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12491,6 +12537,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -12507,7 +12554,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12518,7 +12567,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12530,7 +12581,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12555,7 +12608,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12582,6 +12637,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -12594,7 +12650,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -12844,7 +12902,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12886,7 +12946,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12901,7 +12963,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12917,7 +12981,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12947,7 +13013,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12995,7 +13063,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13058,7 +13128,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13100,7 +13172,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13115,7 +13189,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13131,7 +13207,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13161,7 +13239,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13209,7 +13289,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13772,6 +13854,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -13832,6 +13915,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -13857,6 +13941,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13873,6 +13958,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13904,6 +13990,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13951,6 +14038,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -14020,7 +14108,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14052,6 +14142,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -14082,7 +14173,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14107,7 +14200,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14123,7 +14218,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14156,7 +14253,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14191,6 +14290,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -14207,7 +14307,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14278,7 +14380,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14310,6 +14414,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -14340,7 +14445,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14365,7 +14472,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14381,7 +14490,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14414,7 +14525,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14449,6 +14562,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -14465,7 +14579,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -15032,7 +15148,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15064,6 +15182,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -15094,7 +15213,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15119,7 +15240,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15135,7 +15258,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15168,7 +15293,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15203,6 +15330,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -15219,7 +15347,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -15290,7 +15420,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15322,6 +15454,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -15352,7 +15485,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15377,7 +15512,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15393,7 +15530,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15426,7 +15565,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15461,6 +15602,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -15477,7 +15619,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15548,7 +15692,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15580,6 +15726,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -15610,7 +15757,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15635,7 +15784,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15651,7 +15802,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15684,7 +15837,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15719,6 +15874,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -15735,7 +15891,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15806,6 +15964,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -15868,6 +16027,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -15893,6 +16053,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -15909,6 +16070,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15942,6 +16104,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15993,6 +16156,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -16064,6 +16228,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -16126,6 +16291,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -16151,6 +16317,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -16167,6 +16334,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -16200,6 +16368,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -16251,6 +16420,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -16322,7 +16492,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16354,6 +16526,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -16384,7 +16557,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16409,7 +16584,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16425,7 +16602,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16458,7 +16637,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16493,6 +16674,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -16509,7 +16691,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16580,7 +16764,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16612,6 +16798,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -16642,7 +16829,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16667,7 +16856,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16683,7 +16874,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16716,7 +16909,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16751,6 +16946,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -16767,7 +16963,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16838,7 +17036,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16870,6 +17070,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -16900,7 +17101,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16925,7 +17128,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16941,7 +17146,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16974,7 +17181,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -17009,6 +17218,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -17025,7 +17235,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -17672,6 +17884,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -17740,6 +17953,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -17769,6 +17983,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -17789,6 +18004,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17826,6 +18042,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17883,6 +18100,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -17966,7 +18184,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18036,7 +18256,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18065,7 +18287,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18085,7 +18309,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18123,7 +18349,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18183,7 +18411,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18270,7 +18500,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18340,7 +18572,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18369,7 +18603,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -18389,7 +18625,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18427,7 +18665,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -18487,7 +18727,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19160,7 +19402,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19230,7 +19474,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19259,7 +19505,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19279,7 +19527,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19317,7 +19567,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19377,7 +19629,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19464,7 +19718,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19534,7 +19790,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19563,7 +19821,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19583,7 +19843,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19621,7 +19883,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19681,7 +19945,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19768,7 +20034,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19838,7 +20106,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19867,7 +20137,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -19887,7 +20159,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19925,7 +20199,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -19985,7 +20261,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20072,6 +20350,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -20142,6 +20421,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -20171,6 +20451,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -20191,6 +20472,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20229,6 +20511,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20289,6 +20572,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -20376,6 +20660,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -20446,6 +20731,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -20475,6 +20761,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -20495,6 +20782,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20533,6 +20821,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20593,6 +20882,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -20678,7 +20968,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20748,7 +21040,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20777,7 +21071,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20797,7 +21093,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20835,7 +21133,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -20895,7 +21195,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -20982,7 +21284,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21052,7 +21356,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21081,7 +21387,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21101,7 +21409,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -21139,7 +21449,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -21199,7 +21511,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21286,7 +21600,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21356,7 +21672,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21385,7 +21703,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -21405,7 +21725,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -21443,7 +21765,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -21503,7 +21827,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 74a72e04fa4ae..375fb084313a3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -2038,6 +2038,7 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -2053,6 +2054,7 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -2141,6 +2143,7 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -2156,6 +2159,7 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -2246,6 +2250,7 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -2261,6 +2266,7 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -2349,6 +2355,7 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -2364,6 +2371,7 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -3843,6 +3851,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -3862,6 +3871,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -3975,6 +3985,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3994,6 +4005,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -4116,6 +4128,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -4135,6 +4148,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -4248,6 +4262,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4267,6 +4282,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -4885,6 +4901,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -4904,6 +4921,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5017,6 +5035,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5036,6 +5055,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5158,6 +5178,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5177,6 +5198,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5290,6 +5312,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5309,6 +5332,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5431,6 +5455,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5450,6 +5475,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5563,6 +5589,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5582,6 +5609,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -6250,6 +6278,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -6269,6 +6298,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -6382,6 +6412,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -6401,6 +6432,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -6523,6 +6555,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -6542,6 +6575,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -6655,6 +6689,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -6674,6 +6709,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -6796,6 +6832,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -6815,6 +6852,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -6928,6 +6966,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -6947,6 +6986,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13311,6 +13351,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -13326,6 +13367,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -13414,6 +13456,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13429,6 +13472,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13519,6 +13563,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -13534,6 +13579,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -13622,6 +13668,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13637,6 +13684,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15116,6 +15164,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15135,6 +15184,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -15248,6 +15298,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -15267,6 +15318,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15389,6 +15441,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15408,6 +15461,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -15521,6 +15575,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -15540,6 +15595,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16158,6 +16214,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16177,6 +16234,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16290,6 +16348,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16309,6 +16368,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16431,6 +16491,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16450,6 +16511,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16563,6 +16625,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16582,6 +16645,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16704,6 +16768,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16723,6 +16788,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16836,6 +16902,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16855,6 +16922,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17523,6 +17591,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17542,6 +17611,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17655,6 +17725,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17674,6 +17745,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -17796,6 +17868,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -17815,6 +17888,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17928,6 +18002,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17947,6 +18022,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -18069,6 +18145,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -18088,6 +18165,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -18201,6 +18279,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -18220,6 +18299,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index be148464c156e..4b5487904e351 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -2056,6 +2056,7 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -2071,6 +2072,7 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -2163,6 +2165,7 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -2178,6 +2181,7 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -2270,6 +2274,7 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -2285,6 +2290,7 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -2377,6 +2383,7 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -2392,6 +2399,7 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -3893,6 +3901,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -3912,6 +3921,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -4029,6 +4039,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4048,6 +4059,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -4172,6 +4184,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -4191,6 +4204,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -4308,6 +4322,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4327,6 +4342,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -4951,6 +4967,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -4970,6 +4987,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5087,6 +5105,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5106,6 +5125,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5230,6 +5250,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5249,6 +5270,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5366,6 +5388,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5385,6 +5408,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5509,6 +5533,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5528,6 +5553,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5645,6 +5671,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5664,6 +5691,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5788,6 +5816,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5807,6 +5836,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5924,6 +5954,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5943,6 +5974,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -12117,6 +12149,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -12132,6 +12165,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -12224,6 +12258,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -12239,6 +12274,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -12331,6 +12367,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -12346,6 +12383,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -12438,6 +12476,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -12453,6 +12492,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13954,6 +13994,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -13973,6 +14014,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -14090,6 +14132,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -14109,6 +14152,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -14233,6 +14277,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -14252,6 +14297,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -14369,6 +14415,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -14388,6 +14435,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15012,6 +15060,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15031,6 +15080,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -15148,6 +15198,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -15167,6 +15218,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15291,6 +15343,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15310,6 +15363,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -15427,6 +15481,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -15446,6 +15501,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -15570,6 +15626,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -15589,6 +15646,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -15706,6 +15764,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -15725,6 +15784,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16407,6 +16467,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16426,6 +16487,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16543,6 +16605,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16562,6 +16625,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16686,6 +16750,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16705,6 +16770,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -16822,6 +16888,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -16841,6 +16908,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv @@ -16965,6 +17033,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -16984,6 +17053,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: buffer_gl0_inv @@ -17101,6 +17171,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -17120,6 +17191,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: buffer_gl0_inv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 69b0c7f93ab0e..a528c3ac4df3e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -609,6 +609,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1938,6 +1939,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -1953,6 +1955,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -1965,6 +1968,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -1978,6 +1982,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -1996,6 +2001,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -2007,6 +2013,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -2031,6 +2038,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -2056,6 +2064,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -2069,6 +2078,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -2120,6 +2130,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2135,6 +2146,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2147,6 +2159,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -2160,6 +2173,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2178,6 +2192,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2189,6 +2204,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2213,6 +2229,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2238,6 +2255,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -2251,6 +2269,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2507,6 +2526,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2715,6 +2735,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3596,6 +3617,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -3625,6 +3647,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -3641,6 +3664,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -3658,6 +3682,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -3681,6 +3706,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -3696,6 +3722,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -3728,6 +3755,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -3761,6 +3789,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -3778,6 +3807,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -3843,6 +3873,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -3872,6 +3903,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -3888,6 +3920,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -3905,6 +3938,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -3928,6 +3962,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -3943,6 +3978,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -3975,6 +4011,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -4008,6 +4045,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -4025,6 +4063,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -4548,6 +4587,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -4577,6 +4617,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -4593,6 +4634,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4610,6 +4652,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -4633,6 +4676,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -4648,6 +4692,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -4680,6 +4725,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -4713,6 +4759,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -4730,6 +4777,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -4795,6 +4843,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -4824,6 +4873,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -4840,6 +4890,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4857,6 +4908,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -4880,6 +4932,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -4895,6 +4948,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -4927,6 +4981,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -4960,6 +5015,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -4977,6 +5033,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -5042,6 +5099,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5071,6 +5129,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5087,6 +5146,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -5104,6 +5164,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5127,6 +5188,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5142,6 +5204,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5174,6 +5237,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5207,6 +5271,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -5224,6 +5289,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5783,6 +5849,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -5812,6 +5879,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -5828,6 +5896,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -5845,6 +5914,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -5868,6 +5938,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -5883,6 +5954,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -5915,6 +5987,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -5948,6 +6021,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -5965,6 +6039,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -6030,6 +6105,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6059,6 +6135,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6075,6 +6152,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -6092,6 +6170,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6115,6 +6194,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6130,6 +6210,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6162,6 +6243,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6195,6 +6277,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -6212,6 +6295,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6277,6 +6361,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6306,6 +6391,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6322,6 +6408,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -6339,6 +6426,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6362,6 +6450,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6377,6 +6466,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6409,6 +6499,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6442,6 +6533,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -6459,6 +6551,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -7300,8 +7393,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -7332,6 +7425,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7395,8 +7489,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -7576,8 +7670,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -7608,6 +7702,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7671,8 +7766,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8366,8 +8461,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -8398,6 +8493,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8461,8 +8557,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8642,8 +8738,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -8674,6 +8770,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8737,8 +8834,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -8918,8 +9015,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -8950,6 +9047,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9013,8 +9111,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -9744,8 +9842,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -9776,6 +9874,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -9839,8 +9938,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10020,8 +10119,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -10052,6 +10151,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10115,8 +10215,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10296,8 +10396,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v1, v2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -10328,6 +10428,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -10391,8 +10492,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11128,6 +11229,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 s5, s14 ; GFX6-NEXT: s_mov_b32 s6, s13 ; GFX6-NEXT: s_mov_b32 s7, s12 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -11143,7 +11245,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -11194,6 +11298,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -11632,6 +11737,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -11646,6 +11752,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11668,6 +11775,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -11686,6 +11794,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11696,6 +11805,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11717,6 +11827,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11750,6 +11861,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -11798,6 +11910,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -11812,6 +11925,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -11834,6 +11948,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -11852,6 +11967,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11862,6 +11978,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11883,6 +12000,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11916,6 +12034,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -12281,6 +12400,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -12295,6 +12415,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -12317,6 +12438,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -12334,6 +12456,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -12344,6 +12467,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12365,6 +12489,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -12398,6 +12523,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -12445,7 +12571,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -12459,7 +12587,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -12472,6 +12602,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -12483,7 +12614,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -12500,7 +12633,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -12510,7 +12645,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -12533,7 +12670,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -12559,6 +12698,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -12570,7 +12710,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -12619,7 +12761,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -12633,7 +12777,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -12646,6 +12792,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -12657,7 +12804,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -12674,7 +12823,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -12684,7 +12835,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -12707,7 +12860,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -12733,6 +12888,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -12744,7 +12900,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -12981,6 +13139,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -12997,7 +13156,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13026,6 +13187,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -13045,6 +13207,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -13057,6 +13220,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -13083,6 +13247,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -13124,6 +13289,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13181,6 +13347,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_mov_b32 s6, s10 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -13197,7 +13364,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -13226,6 +13395,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -13245,6 +13415,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -13257,6 +13428,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -13283,6 +13455,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -13324,6 +13497,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13834,6 +14008,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -13862,6 +14037,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -13892,6 +14068,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -13914,6 +14091,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13928,6 +14106,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13957,6 +14136,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -14002,6 +14182,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -14063,7 +14244,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14091,7 +14274,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14108,6 +14293,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -14123,7 +14309,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14145,7 +14333,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14159,7 +14349,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14190,7 +14382,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14224,6 +14418,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -14239,7 +14434,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -14302,7 +14499,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14330,7 +14529,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14347,6 +14548,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -14362,7 +14564,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14384,7 +14588,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14398,7 +14604,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14429,7 +14637,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14463,6 +14673,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -14478,7 +14689,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -14999,7 +15212,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -15027,7 +15242,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -15044,6 +15261,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -15059,7 +15277,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -15081,7 +15301,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -15095,7 +15317,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -15126,7 +15350,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -15160,6 +15386,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -15175,7 +15402,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -15238,7 +15467,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15266,7 +15497,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15283,6 +15516,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -15298,7 +15532,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15320,7 +15556,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15334,7 +15572,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15365,7 +15605,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15399,6 +15641,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -15414,7 +15657,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -15477,7 +15722,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15505,7 +15752,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15522,6 +15771,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -15537,7 +15787,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15559,7 +15811,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15573,7 +15827,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15604,7 +15860,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15638,6 +15896,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -15653,7 +15912,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -15716,6 +15977,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -15744,6 +16006,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -15776,6 +16039,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -15798,6 +16062,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -15812,6 +16077,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15843,6 +16109,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -15892,6 +16159,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -15955,6 +16223,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -15983,6 +16252,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; @@ -16015,6 +16285,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -16037,6 +16308,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -16051,6 +16323,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -16082,6 +16355,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -16131,6 +16405,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -16194,7 +16469,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16222,7 +16499,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16239,6 +16518,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -16254,7 +16534,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16276,7 +16558,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16290,7 +16574,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16321,7 +16607,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16355,6 +16643,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -16370,7 +16659,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -16433,7 +16724,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16461,7 +16754,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16478,6 +16773,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -16493,7 +16789,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16515,7 +16813,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16529,7 +16829,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16560,7 +16862,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16594,6 +16898,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -16609,7 +16914,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -16672,7 +16979,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16700,7 +17009,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16717,6 +17028,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -16732,7 +17044,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16754,7 +17068,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16768,7 +17084,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16799,7 +17117,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -16833,6 +17153,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_endpgm @@ -16848,7 +17169,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -17418,6 +17741,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -17449,6 +17773,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -17485,6 +17810,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -17509,6 +17835,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -17526,6 +17853,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17559,6 +17887,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17610,6 +17939,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17679,9 +18009,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -17710,7 +18041,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -17747,6 +18080,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -17771,9 +18105,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -17788,6 +18123,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -17822,6 +18158,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -17875,6 +18212,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17947,9 +18285,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -17978,7 +18317,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -18015,6 +18356,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -18039,9 +18381,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18056,6 +18399,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18090,6 +18434,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18143,6 +18488,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18729,9 +19075,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -18760,7 +19107,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -18797,6 +19146,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -18821,9 +19171,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -18838,6 +19189,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -18872,6 +19224,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -18925,6 +19278,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18997,9 +19351,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -19028,7 +19383,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -19065,6 +19422,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19089,9 +19447,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19106,6 +19465,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19140,6 +19500,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19193,6 +19554,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19265,9 +19627,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -19296,7 +19659,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -19333,6 +19698,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19357,9 +19723,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -19374,6 +19741,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19408,6 +19776,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19461,6 +19830,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19533,6 +19903,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -19564,6 +19935,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -19601,6 +19973,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19625,6 +19998,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -19642,6 +20016,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19676,6 +20051,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19729,6 +20105,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19801,6 +20178,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -19832,6 +20210,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -19869,6 +20248,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19893,6 +20273,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -19910,6 +20291,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -19944,6 +20326,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -19997,6 +20380,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20067,9 +20451,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -20098,7 +20483,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -20135,6 +20522,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -20159,9 +20547,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20176,6 +20565,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -20210,6 +20600,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -20263,6 +20654,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20335,9 +20727,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -20366,7 +20759,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -20403,6 +20798,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -20427,9 +20823,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20444,6 +20841,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -20478,6 +20876,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -20531,6 +20930,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20603,9 +21003,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -20634,7 +21035,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -20671,6 +21074,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -20695,9 +21099,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -20712,6 +21117,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -20746,6 +21152,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -20799,6 +21206,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 0467c5047a0be..d97881cbf6a28 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -628,6 +628,7 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -655,6 +656,7 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1863,6 +1865,7 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1887,6 +1890,7 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -2037,6 +2041,7 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -2061,6 +2066,7 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -2414,6 +2420,7 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2443,6 +2450,7 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2620,6 +2628,7 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2649,6 +2658,7 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3360,6 +3370,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3388,6 +3399,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3561,6 +3573,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3589,6 +3602,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4128,6 +4142,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4156,6 +4171,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4329,6 +4345,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4357,6 +4374,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4530,6 +4548,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4558,6 +4577,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5133,6 +5153,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5161,6 +5182,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5334,6 +5356,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5362,6 +5385,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5535,6 +5559,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5563,6 +5588,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -6407,6 +6433,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6440,6 +6467,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6641,6 +6669,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6674,6 +6703,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7307,6 +7337,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7340,6 +7371,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7541,6 +7573,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7574,6 +7607,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7775,6 +7809,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7808,6 +7843,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8477,6 +8513,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8510,6 +8547,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8711,6 +8749,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8744,6 +8783,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8945,6 +8985,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8978,6 +9019,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9585,7 +9627,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9599,7 +9643,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9612,7 +9658,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -9624,7 +9672,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -9637,7 +9687,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9650,7 +9702,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9662,7 +9716,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9674,7 +9730,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9686,7 +9744,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9698,7 +9758,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -9710,7 +9772,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -10036,6 +10100,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10047,6 +10112,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10057,6 +10123,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10067,6 +10134,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10078,6 +10146,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10088,6 +10157,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10098,6 +10168,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10108,6 +10179,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10118,6 +10190,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10128,6 +10201,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10138,6 +10212,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10176,6 +10251,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10187,6 +10263,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10197,6 +10274,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10207,6 +10285,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10218,6 +10297,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10228,6 +10308,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10238,6 +10319,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10248,6 +10330,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10258,6 +10341,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10268,6 +10352,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10278,6 +10363,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10596,6 +10682,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10607,6 +10694,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10617,6 +10705,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10627,6 +10716,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10638,6 +10728,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10648,6 +10739,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10658,6 +10750,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10668,6 +10761,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10678,6 +10772,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10688,6 +10783,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10698,6 +10794,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10736,7 +10833,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10747,7 +10846,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10757,7 +10858,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10767,7 +10870,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10778,7 +10883,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10788,7 +10895,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10798,7 +10907,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10808,7 +10919,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10818,7 +10931,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10828,7 +10943,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10838,7 +10955,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: @@ -10876,7 +10995,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10887,7 +11008,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10897,7 +11020,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10907,7 +11032,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10918,7 +11045,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10928,7 +11057,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10938,7 +11069,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10948,7 +11081,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10958,7 +11093,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10968,7 +11105,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -10978,7 +11117,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: @@ -11199,7 +11340,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11214,7 +11357,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11228,7 +11373,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -11241,7 +11388,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -11255,7 +11404,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11269,7 +11420,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11282,7 +11435,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11295,7 +11450,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11308,7 +11465,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11321,7 +11480,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -11334,7 +11495,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -11382,7 +11545,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11397,7 +11562,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11411,7 +11578,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -11424,7 +11593,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -11438,7 +11609,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11452,7 +11625,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11465,7 +11640,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11478,7 +11655,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11491,7 +11670,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11504,7 +11685,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -11517,7 +11700,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -11901,6 +12086,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11914,6 +12100,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11926,6 +12113,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11938,6 +12126,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -11951,6 +12140,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11963,6 +12153,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11975,6 +12166,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11987,6 +12179,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11999,6 +12192,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -12011,6 +12205,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -12023,6 +12218,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -12068,7 +12264,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12081,7 +12279,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12093,7 +12293,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12105,7 +12307,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12118,7 +12322,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12130,7 +12336,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12142,7 +12350,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12154,7 +12364,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12166,7 +12378,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12178,7 +12392,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12190,7 +12406,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: @@ -12235,7 +12453,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12248,7 +12468,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12260,7 +12482,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12272,7 +12496,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12285,7 +12511,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12297,7 +12525,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12309,7 +12539,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12321,7 +12553,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12333,7 +12567,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12345,7 +12581,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12357,7 +12595,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: @@ -12736,7 +12976,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12749,7 +12991,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12761,7 +13005,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12773,7 +13019,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12786,7 +13034,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12798,7 +13048,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12810,7 +13062,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12822,7 +13076,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12834,7 +13090,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12846,7 +13104,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12858,7 +13118,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: @@ -12903,7 +13165,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12916,7 +13180,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12928,7 +13194,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12940,7 +13208,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12953,7 +13223,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12965,7 +13237,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12977,7 +13251,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -12989,7 +13265,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -13001,7 +13279,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -13013,7 +13293,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -13025,7 +13307,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: @@ -13070,7 +13354,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13083,7 +13369,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13095,7 +13383,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13107,7 +13397,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13120,7 +13412,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13132,7 +13426,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13144,7 +13440,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13156,7 +13454,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13168,7 +13468,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13180,7 +13482,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13192,7 +13496,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: @@ -13237,6 +13543,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -13250,6 +13557,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -13262,6 +13570,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13274,6 +13583,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -13287,6 +13597,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13299,6 +13610,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13311,6 +13623,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -13323,6 +13636,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13335,6 +13649,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -13347,6 +13662,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -13359,6 +13675,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -13404,6 +13721,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -13417,6 +13735,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -13429,6 +13748,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13441,6 +13761,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -13454,6 +13775,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13466,6 +13788,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13478,6 +13801,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -13490,6 +13814,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13502,6 +13827,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -13514,6 +13840,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -13526,6 +13853,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -13571,7 +13899,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13584,7 +13914,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13596,7 +13928,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13608,7 +13942,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13621,7 +13957,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13633,7 +13971,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13645,7 +13985,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13657,7 +13999,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13669,7 +14013,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13681,7 +14027,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13693,7 +14041,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: @@ -13738,7 +14088,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13751,7 +14103,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13763,7 +14117,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13775,7 +14131,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13788,7 +14146,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13800,7 +14160,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13812,7 +14174,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13824,7 +14188,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13836,7 +14202,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13848,7 +14216,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13860,7 +14230,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: @@ -13905,7 +14277,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13918,7 +14292,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13930,7 +14306,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13942,7 +14320,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13955,7 +14335,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13967,7 +14349,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13979,7 +14363,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -13991,7 +14377,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -14003,7 +14391,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -14015,7 +14405,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -14027,7 +14419,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: @@ -14494,6 +14888,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14511,6 +14906,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14527,6 +14923,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14542,6 +14939,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14558,6 +14956,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14574,6 +14973,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14589,6 +14989,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14604,6 +15005,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14619,6 +15021,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14634,6 +15037,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14649,6 +15053,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14705,7 +15110,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14722,7 +15129,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14738,7 +15147,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14753,7 +15164,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14769,7 +15182,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14785,7 +15200,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14800,7 +15217,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14815,7 +15234,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14830,7 +15251,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14845,7 +15268,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14860,7 +15285,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14916,7 +15343,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14933,7 +15362,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14949,7 +15380,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14964,7 +15397,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14980,7 +15415,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14996,7 +15433,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15011,7 +15450,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15026,7 +15467,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15041,7 +15484,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15056,7 +15501,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15071,7 +15518,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15549,7 +15998,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15566,7 +16017,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15582,7 +16035,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15597,7 +16052,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15613,7 +16070,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15629,7 +16088,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15644,7 +16105,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15659,7 +16122,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15674,7 +16139,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15689,7 +16156,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15704,7 +16173,9 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15760,7 +16231,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15777,7 +16250,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15793,7 +16268,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15808,7 +16285,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15824,7 +16303,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15840,7 +16321,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15855,7 +16338,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15870,7 +16355,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15885,7 +16372,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15900,7 +16389,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15915,7 +16406,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15971,7 +16464,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15988,7 +16483,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16004,7 +16501,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16019,7 +16518,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16035,7 +16536,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16051,7 +16554,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16066,7 +16571,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16081,7 +16588,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16096,7 +16605,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16111,7 +16622,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16126,7 +16639,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16182,6 +16697,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16199,6 +16715,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16215,6 +16732,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16230,6 +16748,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16246,6 +16765,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16262,6 +16782,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16277,6 +16798,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16292,6 +16814,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16307,6 +16830,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16322,6 +16846,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16337,6 +16862,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16393,6 +16919,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16410,6 +16937,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16426,6 +16954,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16441,6 +16970,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16457,6 +16987,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16473,6 +17004,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16488,6 +17020,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16503,6 +17036,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16518,6 +17052,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16533,6 +17068,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16548,6 +17084,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16604,7 +17141,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16621,7 +17160,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16637,7 +17178,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16652,7 +17195,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16668,7 +17213,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16684,7 +17231,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16699,7 +17248,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16714,7 +17265,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16729,7 +17282,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16744,7 +17299,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16759,7 +17316,9 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16815,7 +17374,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16832,7 +17393,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16848,7 +17411,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16863,7 +17428,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16879,7 +17446,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16895,7 +17464,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16910,7 +17481,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16925,7 +17498,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16940,7 +17515,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16955,7 +17532,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16970,7 +17549,9 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -17026,7 +17607,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17043,7 +17626,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -17059,7 +17644,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -17074,7 +17661,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -17090,7 +17679,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17106,7 +17697,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17121,7 +17714,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17136,7 +17731,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17151,7 +17748,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17166,7 +17765,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -17181,7 +17782,9 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 74a297241d851..7ee83c08fc2b3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -628,6 +628,7 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -655,6 +656,7 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1863,6 +1865,7 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1887,6 +1890,7 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -2037,6 +2041,7 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -2061,6 +2066,7 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -2414,6 +2420,7 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2443,6 +2450,7 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2620,6 +2628,7 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2649,6 +2658,7 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3360,6 +3370,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3388,6 +3399,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3561,6 +3573,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3589,6 +3602,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4128,6 +4142,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4156,6 +4171,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4329,6 +4345,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4357,6 +4374,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4530,6 +4548,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4558,6 +4577,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5133,6 +5153,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5161,6 +5182,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5334,6 +5356,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5362,6 +5385,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5535,6 +5559,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5563,6 +5588,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -6407,6 +6433,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6440,6 +6467,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6641,6 +6669,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6674,6 +6703,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7307,6 +7337,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7340,6 +7371,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7541,6 +7573,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7574,6 +7607,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7775,6 +7809,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7808,6 +7843,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8477,6 +8513,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8510,6 +8547,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8711,6 +8749,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8744,6 +8783,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8945,6 +8985,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8978,6 +9019,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9585,7 +9627,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9599,7 +9643,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9612,7 +9658,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -9624,7 +9672,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -9637,7 +9687,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9650,7 +9702,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9662,7 +9716,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9674,7 +9730,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9686,7 +9744,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9698,7 +9758,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -9710,7 +9772,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -10036,6 +10100,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10047,6 +10112,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10057,6 +10123,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10067,6 +10134,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10078,6 +10146,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10088,6 +10157,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10098,6 +10168,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10108,6 +10179,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10118,6 +10190,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10128,6 +10201,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10138,6 +10212,7 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10176,6 +10251,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10187,6 +10263,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10197,6 +10274,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10207,6 +10285,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10218,6 +10297,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10228,6 +10308,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10238,6 +10319,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10248,6 +10330,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10258,6 +10341,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10268,6 +10352,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10278,6 +10363,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10596,6 +10682,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10607,6 +10694,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10617,6 +10705,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10627,6 +10716,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10638,6 +10728,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10648,6 +10739,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10658,6 +10750,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10668,6 +10761,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10678,6 +10772,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10688,6 +10783,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10698,6 +10794,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10736,7 +10833,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10747,7 +10846,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10757,7 +10858,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10767,7 +10870,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10778,7 +10883,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10788,7 +10895,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10798,7 +10907,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10808,7 +10919,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10818,7 +10931,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10828,7 +10943,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10838,7 +10955,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: @@ -10876,7 +10995,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10887,7 +11008,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10897,7 +11020,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10907,7 +11032,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10918,7 +11045,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10928,7 +11057,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10938,7 +11069,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10948,7 +11081,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10958,7 +11093,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10968,7 +11105,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -10978,7 +11117,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: @@ -11199,7 +11340,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11214,7 +11357,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11228,7 +11373,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -11241,7 +11388,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -11255,7 +11404,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11269,7 +11420,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11282,7 +11435,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11295,7 +11450,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11308,7 +11465,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11321,7 +11480,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -11334,7 +11495,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -11382,7 +11545,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11397,7 +11562,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11411,7 +11578,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -11424,7 +11593,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -11438,7 +11609,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11452,7 +11625,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11465,7 +11640,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11478,7 +11655,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11491,7 +11670,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11504,7 +11685,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -11517,7 +11700,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -11901,6 +12086,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11914,6 +12100,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11926,6 +12113,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11938,6 +12126,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -11951,6 +12140,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11963,6 +12153,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11975,6 +12166,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11987,6 +12179,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11999,6 +12192,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -12011,6 +12205,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -12023,6 +12218,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -12068,7 +12264,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12081,7 +12279,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12093,7 +12293,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12105,7 +12307,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12118,7 +12322,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12130,7 +12336,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12142,7 +12350,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12154,7 +12364,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12166,7 +12378,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12178,7 +12392,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12190,7 +12406,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: @@ -12235,7 +12453,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12248,7 +12468,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12260,7 +12482,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12272,7 +12496,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12285,7 +12511,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12297,7 +12525,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12309,7 +12539,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12321,7 +12553,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12333,7 +12567,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12345,7 +12581,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12357,7 +12595,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: @@ -12736,7 +12976,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12749,7 +12991,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12761,7 +13005,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12773,7 +13019,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12786,7 +13034,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12798,7 +13048,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12810,7 +13062,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12822,7 +13076,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12834,7 +13090,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12846,7 +13104,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12858,7 +13118,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: @@ -12903,7 +13165,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12916,7 +13180,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12928,7 +13194,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12940,7 +13208,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12953,7 +13223,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12965,7 +13237,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12977,7 +13251,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -12989,7 +13265,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -13001,7 +13279,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -13013,7 +13293,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -13025,7 +13307,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: @@ -13070,7 +13354,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13083,7 +13369,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13095,7 +13383,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13107,7 +13397,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13120,7 +13412,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13132,7 +13426,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13144,7 +13440,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13156,7 +13454,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13168,7 +13468,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13180,7 +13482,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13192,7 +13496,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: @@ -13237,6 +13543,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -13250,6 +13557,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -13262,6 +13570,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13274,6 +13583,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -13287,6 +13597,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13299,6 +13610,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13311,6 +13623,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -13323,6 +13636,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13335,6 +13649,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -13347,6 +13662,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -13359,6 +13675,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -13404,6 +13721,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -13417,6 +13735,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -13429,6 +13748,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13441,6 +13761,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -13454,6 +13775,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13466,6 +13788,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13478,6 +13801,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -13490,6 +13814,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13502,6 +13827,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -13514,6 +13840,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -13526,6 +13853,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -13571,7 +13899,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13584,7 +13914,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13596,7 +13928,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13608,7 +13942,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13621,7 +13957,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13633,7 +13971,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13645,7 +13985,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13657,7 +13999,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13669,7 +14013,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13681,7 +14027,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13693,7 +14041,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: @@ -13738,7 +14088,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13751,7 +14103,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13763,7 +14117,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13775,7 +14131,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13788,7 +14146,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13800,7 +14160,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13812,7 +14174,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13824,7 +14188,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13836,7 +14202,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13848,7 +14216,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13860,7 +14230,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: @@ -13905,7 +14277,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13918,7 +14292,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13930,7 +14306,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13942,7 +14320,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13955,7 +14335,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13967,7 +14349,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13979,7 +14363,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -13991,7 +14377,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -14003,7 +14391,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -14015,7 +14405,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -14027,7 +14419,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: @@ -14494,6 +14888,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14511,6 +14906,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14527,6 +14923,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14542,6 +14939,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14558,6 +14956,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14574,6 +14973,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14589,6 +14989,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14604,6 +15005,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14619,6 +15021,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14634,6 +15037,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14649,6 +15053,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14705,7 +15110,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14722,7 +15129,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14738,7 +15147,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14753,7 +15164,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14769,7 +15182,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14785,7 +15200,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14800,7 +15217,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14815,7 +15234,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14830,7 +15251,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14845,7 +15268,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14860,7 +15285,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14916,7 +15343,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14933,7 +15362,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14949,7 +15380,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14964,7 +15397,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14980,7 +15415,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14996,7 +15433,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15011,7 +15450,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15026,7 +15467,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15041,7 +15484,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15056,7 +15501,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15071,7 +15518,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15549,7 +15998,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15566,7 +16017,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15582,7 +16035,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15597,7 +16052,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15613,7 +16070,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15629,7 +16088,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15644,7 +16105,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15659,7 +16122,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15674,7 +16139,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15689,7 +16156,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15704,7 +16173,9 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15760,7 +16231,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15777,7 +16250,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15793,7 +16268,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15808,7 +16285,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15824,7 +16303,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15840,7 +16321,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15855,7 +16338,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15870,7 +16355,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15885,7 +16372,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15900,7 +16389,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15915,7 +16406,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15971,7 +16464,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15988,7 +16483,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16004,7 +16501,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16019,7 +16518,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16035,7 +16536,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16051,7 +16554,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16066,7 +16571,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16081,7 +16588,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16096,7 +16605,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16111,7 +16622,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16126,7 +16639,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16182,6 +16697,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16199,6 +16715,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16215,6 +16732,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16230,6 +16748,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16246,6 +16765,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16262,6 +16782,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16277,6 +16798,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16292,6 +16814,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16307,6 +16830,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16322,6 +16846,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16337,6 +16862,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16393,6 +16919,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16410,6 +16937,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16426,6 +16954,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16441,6 +16970,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16457,6 +16987,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16473,6 +17004,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16488,6 +17020,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16503,6 +17036,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16518,6 +17052,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16533,6 +17068,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16548,6 +17084,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16604,7 +17141,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16621,7 +17160,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16637,7 +17178,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16652,7 +17195,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16668,7 +17213,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16684,7 +17231,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16699,7 +17248,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16714,7 +17265,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16729,7 +17282,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16744,7 +17299,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16759,7 +17316,9 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16815,7 +17374,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16832,7 +17393,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16848,7 +17411,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16863,7 +17428,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16879,7 +17446,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16895,7 +17464,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16910,7 +17481,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16925,7 +17498,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16940,7 +17515,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16955,7 +17532,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16970,7 +17549,9 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -17026,7 +17607,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17043,7 +17626,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -17059,7 +17644,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -17074,7 +17661,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -17090,7 +17679,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17106,7 +17697,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17121,7 +17714,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17136,7 +17731,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17151,7 +17748,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17166,7 +17765,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -17181,7 +17782,9 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 62d7f4801baf8..b2d569e22d5d2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -628,6 +628,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -655,6 +656,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1863,6 +1865,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -1887,6 +1890,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -2037,6 +2041,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -2061,6 +2066,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -2414,6 +2420,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2443,6 +2450,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2620,6 +2628,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2649,6 +2658,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3360,6 +3370,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3388,6 +3399,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -3561,6 +3573,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -3589,6 +3602,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4128,6 +4142,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4156,6 +4171,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4329,6 +4345,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4357,6 +4374,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -4530,6 +4548,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -4558,6 +4577,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5133,6 +5153,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5161,6 +5182,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5334,6 +5356,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5362,6 +5385,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -5535,6 +5559,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -5563,6 +5588,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -6407,6 +6433,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6440,6 +6467,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6641,6 +6669,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6674,6 +6703,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7307,6 +7337,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7340,6 +7371,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7541,6 +7573,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7574,6 +7607,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7775,6 +7809,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7808,6 +7843,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8477,6 +8513,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8510,6 +8547,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8711,6 +8749,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8744,6 +8783,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8945,6 +8985,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -8978,6 +9019,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: buffer_inv sc0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9585,7 +9627,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9599,7 +9643,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9612,7 +9658,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_read_b32 v1, v0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -9624,7 +9672,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_read_b32 v1, v0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -9637,7 +9687,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -9650,7 +9702,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9662,7 +9716,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9674,7 +9730,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9686,7 +9744,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -9698,7 +9758,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_load_b32 v1, v0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -9710,7 +9772,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_load_b32 v1, v0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -10036,6 +10100,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10047,6 +10112,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10057,6 +10123,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10067,6 +10134,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10078,6 +10146,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10088,6 +10157,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10098,6 +10168,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10108,6 +10179,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10118,6 +10190,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10128,6 +10201,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10138,6 +10212,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10176,6 +10251,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10187,6 +10263,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10197,6 +10274,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10207,6 +10285,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10218,6 +10297,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10228,6 +10308,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10238,6 +10319,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10248,6 +10330,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10258,6 +10341,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10268,6 +10352,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10278,6 +10363,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10596,6 +10682,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -10607,6 +10694,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -10617,6 +10705,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -10627,6 +10716,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -10638,6 +10728,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -10648,6 +10739,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10658,6 +10750,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -10668,6 +10761,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -10678,6 +10772,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -10688,6 +10783,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -10698,6 +10794,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -10736,7 +10833,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10747,7 +10846,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10757,7 +10858,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10767,7 +10870,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10778,7 +10883,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10788,7 +10895,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10798,7 +10907,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10808,7 +10919,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10818,7 +10931,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10828,7 +10943,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10838,7 +10955,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: @@ -10876,7 +10995,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10887,7 +11008,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10897,7 +11020,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10907,7 +11032,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10918,7 +11045,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10928,7 +11057,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10938,7 +11069,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10948,7 +11081,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10958,7 +11093,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10968,7 +11105,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -10978,7 +11117,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: @@ -11199,7 +11340,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11214,7 +11357,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11228,7 +11373,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -11241,7 +11388,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -11255,7 +11404,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11269,7 +11420,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11282,7 +11435,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11295,7 +11450,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11308,7 +11465,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11321,7 +11480,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -11334,7 +11495,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -11382,7 +11545,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -11397,7 +11562,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11411,7 +11578,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -11424,7 +11593,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -11438,7 +11609,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -11452,7 +11625,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11465,7 +11640,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11478,7 +11655,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11491,7 +11670,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -11504,7 +11685,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -11517,7 +11700,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -11901,6 +12086,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -11914,6 +12100,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -11926,6 +12113,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -11938,6 +12126,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -11951,6 +12140,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -11963,6 +12153,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11975,6 +12166,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -11987,6 +12179,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -11999,6 +12192,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -12011,6 +12205,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -12023,6 +12218,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -12068,7 +12264,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12081,7 +12279,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12093,7 +12293,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12105,7 +12307,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12118,7 +12322,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12130,7 +12336,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12142,7 +12350,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12154,7 +12364,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12166,7 +12378,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12178,7 +12392,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12190,7 +12406,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -12235,7 +12453,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12248,7 +12468,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12260,7 +12482,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12272,7 +12496,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12285,7 +12511,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12297,7 +12525,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12309,7 +12539,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12321,7 +12553,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12333,7 +12567,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12345,7 +12581,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12357,7 +12595,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -12736,7 +12976,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12749,7 +12991,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12761,7 +13005,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12773,7 +13019,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12786,7 +13034,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12798,7 +13048,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12810,7 +13062,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12822,7 +13076,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12834,7 +13090,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12846,7 +13104,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12858,7 +13118,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: @@ -12903,7 +13165,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12916,7 +13180,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12928,7 +13194,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12940,7 +13208,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12953,7 +13223,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12965,7 +13237,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12977,7 +13251,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -12989,7 +13265,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -13001,7 +13279,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -13013,7 +13293,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -13025,7 +13307,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -13070,7 +13354,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13083,7 +13369,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13095,7 +13383,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13107,7 +13397,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13120,7 +13412,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13132,7 +13426,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13144,7 +13440,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13156,7 +13454,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13168,7 +13468,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13180,7 +13482,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13192,7 +13496,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -13237,6 +13543,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -13250,6 +13557,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -13262,6 +13570,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13274,6 +13583,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -13287,6 +13597,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13299,6 +13610,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13311,6 +13623,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -13323,6 +13636,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13335,6 +13649,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -13347,6 +13662,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -13359,6 +13675,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -13404,6 +13721,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -13417,6 +13735,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -13429,6 +13748,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; @@ -13441,6 +13761,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -13454,6 +13775,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; @@ -13466,6 +13788,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13478,6 +13801,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; @@ -13490,6 +13814,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; @@ -13502,6 +13827,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: s_endpgm ; @@ -13514,6 +13840,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm ; @@ -13526,6 +13853,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -13571,7 +13899,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13584,7 +13914,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13596,7 +13928,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13608,7 +13942,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13621,7 +13957,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13633,7 +13971,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13645,7 +13985,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13657,7 +13999,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13669,7 +14013,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13681,7 +14027,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13693,7 +14041,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: @@ -13738,7 +14088,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13751,7 +14103,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13763,7 +14117,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13775,7 +14131,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13788,7 +14146,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13800,7 +14160,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13812,7 +14174,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13824,7 +14188,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13836,7 +14202,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13848,7 +14216,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13860,7 +14230,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -13905,7 +14277,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13918,7 +14292,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13930,7 +14306,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13942,7 +14320,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13955,7 +14335,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13967,7 +14349,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13979,7 +14363,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX942-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -13991,7 +14377,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX942-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -14003,7 +14391,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -14015,7 +14405,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -14027,7 +14419,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -14494,6 +14888,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -14511,6 +14906,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -14527,6 +14923,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14542,6 +14939,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14558,6 +14956,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -14574,6 +14973,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14589,6 +14989,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14604,6 +15005,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14619,6 +15021,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14634,6 +15037,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14649,6 +15053,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14705,7 +15110,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14722,7 +15129,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14738,7 +15147,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14753,7 +15164,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14769,7 +15182,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14785,7 +15200,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14800,7 +15217,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14815,7 +15234,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14830,7 +15251,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -14845,7 +15268,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -14860,7 +15285,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -14916,7 +15343,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -14933,7 +15362,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14949,7 +15380,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -14964,7 +15397,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -14980,7 +15415,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -14996,7 +15433,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15011,7 +15450,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15026,7 +15467,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15041,7 +15484,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15056,7 +15501,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15071,7 +15518,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15549,7 +15998,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15566,7 +16017,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15582,7 +16035,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15597,7 +16052,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15613,7 +16070,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15629,7 +16088,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15644,7 +16105,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15659,7 +16122,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15674,7 +16139,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15689,7 +16156,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15704,7 +16173,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15760,7 +16231,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15777,7 +16250,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -15793,7 +16268,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -15808,7 +16285,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -15824,7 +16303,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -15840,7 +16321,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15855,7 +16338,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15870,7 +16355,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15885,7 +16372,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -15900,7 +16389,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -15915,7 +16406,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -15971,7 +16464,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -15988,7 +16483,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16004,7 +16501,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16019,7 +16518,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16035,7 +16536,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16051,7 +16554,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16066,7 +16571,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16081,7 +16588,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16096,7 +16605,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16111,7 +16622,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16126,7 +16639,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16182,6 +16697,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16199,6 +16715,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16215,6 +16732,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16230,6 +16748,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16246,6 +16765,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16262,6 +16782,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16277,6 +16798,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16292,6 +16814,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16307,6 +16830,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16322,6 +16846,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16337,6 +16862,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16393,6 +16919,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 @@ -16410,6 +16937,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -16426,6 +16954,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16441,6 +16970,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16457,6 +16987,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 @@ -16473,6 +17004,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16488,6 +17020,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16503,6 +17036,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16518,6 +17052,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -16533,6 +17068,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -16548,6 +17084,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -16604,7 +17141,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16621,7 +17160,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16637,7 +17178,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16652,7 +17195,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16668,7 +17213,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16684,7 +17231,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16699,7 +17248,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16714,7 +17265,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16729,7 +17282,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16744,7 +17299,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16759,7 +17316,9 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -16815,7 +17374,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -16832,7 +17393,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -16848,7 +17411,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -16863,7 +17428,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -16879,7 +17446,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -16895,7 +17464,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16910,7 +17481,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16925,7 +17498,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16940,7 +17515,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -16955,7 +17532,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -16970,7 +17549,9 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -17026,7 +17607,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -17043,7 +17626,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -17059,7 +17644,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -17074,7 +17661,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -17090,7 +17679,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(15) expcnt(7) lgkmcnt(15) ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -17106,7 +17697,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17121,7 +17714,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s5 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17136,7 +17731,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17151,7 +17748,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(15) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -17166,7 +17765,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -17181,7 +17782,9 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX11-CU-NEXT: s_waitcnt vmcnt(63) expcnt(7) lgkmcnt(63) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir index 56dd95e373dc6..19c249d37d1e8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir @@ -321,7 +321,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -433,7 +435,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -545,7 +549,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst (s32) from `ptr addrspace(3) poison`, addrspace 3) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -814,6 +820,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -838,6 +845,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -910,6 +918,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -934,6 +943,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -1006,6 +1016,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -1030,6 +1041,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst (s32) into `ptr addrspace(3) poison`, addrspace 3) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir index 36a244f6250db..af2dbed078422 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir @@ -321,7 +321,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -433,7 +435,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -545,7 +549,9 @@ body: | ; GCN-NEXT: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, align 4, addrspace 4) ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst (s32) from `ptr addrspace(2) poison`, addrspace 2) + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec ; GCN-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr poison`) @@ -814,6 +820,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -838,6 +845,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -910,6 +918,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -934,6 +943,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -1006,6 +1016,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) @@ -1030,6 +1041,7 @@ body: | ; GCN-NEXT: $m0 = S_MOV_B32 -1 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3967 ; GCN-NEXT: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst (s32) into `ptr addrspace(2) poison`, addrspace 2) ; GCN-NEXT: S_ENDPGM 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)